From edf884172e9828c6234b254208af04655855038d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Dec 2007 11:02:48 +0200 Subject: KVM: Move arch dependent files to new directory arch/x86/kvm/ This paves the way for multiple architecture support. Note that while ioapic.c could potentially be shared with ia64, it is also moved. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1805 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1805 insertions(+) create mode 100644 arch/x86/kvm/mmu.c (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c new file mode 100644 index 000000000000..401eb7ce3207 --- /dev/null +++ b/arch/x86/kvm/mmu.c @@ -0,0 +1,1805 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "vmx.h" +#include "mmu.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif + +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } +#endif + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ + | PT64_NX_MASK) + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_FETCH_MASK (1U << 4) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define RMAP_EXT 4 + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +struct kvm_rmap_desc { + u64 *shadow_ptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_header_cache; + +static u64 __read_mostly shadow_trap_nonpresent_pte; +static u64 __read_mostly shadow_notrap_nonpresent_pte; + +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) +{ + shadow_trap_nonpresent_pte = trap_pte; + shadow_notrap_nonpresent_pte = notrap_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr0 & X86_CR0_WP; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shadow_efer & EFER_NX; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_shadow_present_pte(u64 pte) +{ + pte &= ~PT_SHADOW_IO_MARK; + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_dirty_pte(unsigned long pte) +{ + return pte & PT_DIRTY_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static int is_rmap_pte(u64 pte) +{ + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} + +static void set_shadow_pte(u64 *sptep, u64 spte) +{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, + int min) +{ + struct page *page; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_page_private(page, 0); + cache->objects[cache->nobjs++] = page_address(page); + } + return 0; +} + +static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + kvm_mmu_free_some_pages(vcpu); + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, + rmap_desc_cache, 1); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +{ + kfree(pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +{ + kfree(rd); +} + +/* + * Take gfn and return the reverse mapping to it. + * Note: gfn must be unaliased before this function get called + */ + +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + return &slot->rmap[gfn - slot->base_gfn]; +} + +/* + * Reverse mapping data structures: + * + * If rmapp bit zero is zero, then rmapp point to the shadw page table entry + * that points to page_address(page). + * + * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc + * containing more mappings. + */ +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + struct kvm_rmap_desc *desc; + unsigned long *rmapp; + int i; + + if (!is_rmap_pte(*spte)) + return; + gfn = unalias_gfn(vcpu->kvm, gfn); + sp = page_header(__pa(spte)); + sp->gfns[spte - sp->spt] = gfn; + rmapp = gfn_to_rmap(vcpu->kvm, gfn); + if (!*rmapp) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + *rmapp = (unsigned long)spte; + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->shadow_ptes[0] = (u64 *)*rmapp; + desc->shadow_ptes[1] = spte; + *rmapp = (unsigned long)desc | 1; + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + desc = desc->more; + if (desc->shadow_ptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->shadow_ptes[i]; ++i) + ; + desc->shadow_ptes[i] = spte; + } +} + +static void rmap_desc_remove_entry(unsigned long *rmapp, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + ; + desc->shadow_ptes[i] = desc->shadow_ptes[j]; + desc->shadow_ptes[j] = NULL; + if (j != 0) + return; + if (!prev_desc && !desc->more) + *rmapp = (unsigned long)desc->shadow_ptes[0]; + else + if (prev_desc) + prev_desc->more = desc->more; + else + *rmapp = (unsigned long)desc->more | 1; + mmu_free_rmap_desc(desc); +} + +static void rmap_remove(struct kvm *kvm, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + struct kvm_mmu_page *sp; + struct page *page; + unsigned long *rmapp; + int i; + + if (!is_rmap_pte(*spte)) + return; + sp = page_header(__pa(spte)); + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + mark_page_accessed(page); + if (is_writeble_pte(*spte)) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); + if (!*rmapp) { + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + BUG(); + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + if ((u64 *)*rmapp != spte) { + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); + BUG(); + } + *rmapp = 0; + } else { + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) + if (desc->shadow_ptes[i] == spte) { + rmap_desc_remove_entry(rmapp, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + BUG(); + } +} + +static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + u64 *prev_spte; + int i; + + if (!*rmapp) + return NULL; + else if (!(*rmapp & 1)) { + if (!spte) + return (u64 *)*rmapp; + return NULL; + } + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + prev_spte = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + if (prev_spte == spte) + return desc->shadow_ptes[i]; + prev_spte = desc->shadow_ptes[i]; + } + desc = desc->more; + } + return NULL; +} + +static void rmap_write_protect(struct kvm *kvm, u64 gfn) +{ + unsigned long *rmapp; + u64 *spte; + + gfn = unalias_gfn(kvm, gfn); + rmapp = gfn_to_rmap(kvm, gfn); + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + if (is_writeble_pte(*spte)) + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + kvm_flush_remote_tlbs(kvm); + spte = rmap_next(kvm, rmapp, spte); + } +} + +#ifdef MMU_DEBUG +static int is_empty_shadow_page(u64 *spt) +{ + u64 *pos; + u64 *end; + + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { + printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} +#endif + +static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + ASSERT(is_empty_shadow_page(sp->spt)); + list_del(&sp->link); + __free_page(virt_to_page(sp->spt)); + __free_page(virt_to_page(sp->gfns)); + kfree(sp); + ++kvm->arch.n_free_mmu_pages; +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte) +{ + struct kvm_mmu_page *sp; + + if (!vcpu->kvm->arch.n_free_mmu_pages) + return NULL; + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + ASSERT(is_empty_shadow_page(sp->spt)); + sp->slot_bitmap = 0; + sp->multimapped = 0; + sp->parent_pte = parent_pte; + --vcpu->kvm->arch.n_free_mmu_pages; + return sp; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!sp->multimapped) { + u64 *old = sp->parent_pte; + + if (!old) { + sp->parent_pte = parent_pte; + return; + } + sp->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&sp->parent_ptes); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->multimapped) { + BUG_ON(sp->parent_pte != parent_pte); + sp->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(pte_chain); + if (hlist_empty(&sp->parent_ptes)) { + sp->multimapped = 0; + sp->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.metaphysical) { + pgprintk("%s: found role %x\n", + __FUNCTION__, sp->role.word); + return sp; + } + return NULL; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int metaphysical, + unsigned access, + u64 *parent_pte, + bool *new_page) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + role.word = 0; + role.glevels = vcpu->arch.mmu.root_level; + role.level = level; + role.metaphysical = metaphysical; + role.access = access; + if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + gfn, role.word); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && sp->role.word == role.word) { + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + pgprintk("%s: found\n", __FUNCTION__); + return sp; + } + sp = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!sp) + return sp; + pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + sp->gfn = gfn; + sp->role = role; + hlist_add_head(&sp->hash_link, bucket); + vcpu->arch.mmu.prefetch_page(vcpu, sp); + if (!metaphysical) + rmap_write_protect(vcpu->kvm, gfn); + if (new_page) + *new_page = 1; + return sp; +} + +static void kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = sp->spt; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (is_shadow_present_pte(pt[i])) + rmap_remove(kvm, &pt[i]); + pt[i] = shadow_trap_nonpresent_pte; + } + kvm_flush_remote_tlbs(kvm); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = shadow_trap_nonpresent_pte; + if (!is_shadow_present_pte(ent)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), &pt[i]); + } + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) +{ + mmu_page_remove_parent_pte(sp, parent_pte); +} + +static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm->vcpus[i]->arch.last_pte_updated = NULL; +} + +static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + u64 *parent_pte; + + ++kvm->stat.mmu_shadow_zapped; + while (sp->multimapped || sp->parent_pte) { + if (!sp->multimapped) + parent_pte = sp->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(sp->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(sp, parent_pte); + set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + } + kvm_mmu_page_unlink_children(kvm, sp); + if (!sp->root_count) { + hlist_del(&sp->hash_link); + kvm_mmu_free_page(kvm, sp); + } else + list_move(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mmu_reset_last_pte_updated(kvm); +} + +/* + * Changing the number of mmu pages allocated to the vm + * Note: if kvm_nr_mmu_pages is too small, you will get dead lock + */ +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +{ + /* + * If we set the number of mmu pages to be smaller be than the + * number of actived pages , we must to free some mmu pages before we + * change the value + */ + + if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > + kvm_nr_mmu_pages) { + int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages + - kvm->arch.n_free_mmu_pages; + + while (n_used_mmu_pages > kvm_nr_mmu_pages) { + struct kvm_mmu_page *page; + + page = container_of(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(kvm, page); + n_used_mmu_pages--; + } + kvm->arch.n_free_mmu_pages = 0; + } + else + kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages + - kvm->arch.n_alloc_mmu_pages; + + kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; +} + +static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + sp->role.word); + kvm_mmu_zap_page(kvm, sp); + r = 1; + } + return r; +} + +static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + + while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + struct kvm_mmu_page *sp = page_header(__pa(pte)); + + __set_bit(slot, &sp->slot_bitmap); +} + +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, gfn_t gfn) +{ + u64 spte; + int was_rmapped = is_rmap_pte(*shadow_pte); + struct page *page; + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + /* + * We don't set the accessed bit, since we sometimes want to see + * whether the guest actually used the pte (in order to detect + * demand paging). + */ + spte = PT_PRESENT_MASK | PT_DIRTY_MASK; + if (!dirty) + pte_access &= ~ACC_WRITE_MASK; + if (!(pte_access & ACC_EXEC_MASK)) + spte |= PT64_NX_MASK; + + page = gfn_to_page(vcpu->kvm, gfn); + + spte |= PT_PRESENT_MASK; + if (pte_access & ACC_USER_MASK) + spte |= PT_USER_MASK; + + if (is_error_page(page)) { + set_shadow_pte(shadow_pte, + shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); + kvm_release_page_clean(page); + return; + } + + spte |= page_to_phys(page); + + if ((pte_access & ACC_WRITE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + struct kvm_mmu_page *shadow; + + spte |= PT_WRITABLE_MASK; + if (user_fault) { + mmu_unshadow(vcpu->kvm, gfn); + goto unshadowed; + } + + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + pte_access &= ~ACC_WRITE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; + kvm_x86_ops->tlb_flush(vcpu); + } + if (write_fault) + *ptwrite = 1; + } + } + +unshadowed: + + if (pte_access & ACC_WRITE_MASK) + mark_page_dirty(vcpu->kvm, gfn); + + pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); + set_shadow_pte(shadow_pte, spte); + page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + if (!was_rmapped) { + rmap_add(vcpu, shadow_pte, gfn); + if (!is_rmap_pte(*shadow_pte)) + kvm_release_page_clean(page); + } + else + kvm_release_page_clean(page); + if (!ptwrite || !*ptwrite) + vcpu->arch.last_pte_updated = shadow_pte; +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->arch.mmu.root_hpa; + int pt_write = 0; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, gfn); + return pt_write || is_io_pte(table[index]); + } + + if (table[index] == shadow_trap_nonpresent_pte) { + struct kvm_mmu_page *new_table; + gfn_t pseudo_gfn; + + pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, + v, level - 1, + 1, ACC_ALL, &table[index], + NULL); + if (!new_table) { + pgprintk("nonpaging_map: ENOMEM\n"); + return -ENOMEM; + } + + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; +#ifdef CONFIG_X86_64 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + --sp->root_count; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + --sp->root_count; + } + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + } + vcpu->arch.mmu.root_hpa = INVALID_PAGE; +} + +static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *sp; + + root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.root_hpa = root; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + if (!is_present_pte(vcpu->arch.pdptrs[i])) { + vcpu->arch.mmu.pae_root[i] = 0; + continue; + } + root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + } else if (vcpu->arch.mmu.root_level == 0) + root_gfn = 0; + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, !is_paging(vcpu), + ACC_ALL, NULL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + gfn_t gfn; + int r; + + pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + gfn = gva >> PAGE_SHIFT; + + return nonpaging_map(vcpu, gva & PAGE_MASK, + error_code & PFERR_WRITE_MASK, gfn); +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_inject_page_fault(vcpu, addr, err_code); +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->prefetch_page = paging64_prefetch_page; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->prefetch_page = paging32_prefetch_page; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); +} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + vcpu->arch.mmu.free(vcpu); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + mutex_lock(&vcpu->kvm->lock); + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); +out: + mutex_unlock(&vcpu->kvm->lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_shadow_present_pte(pte)) { + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(vcpu->kvm, spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + set_shadow_pte(spte, shadow_trap_nonpresent_pte); +} + +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte, + const void *new, int bytes, + int offset_in_pte) +{ + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + + ++vcpu->kvm->stat.mmu_pte_updated; + if (sp->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + else + paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); +} + +static bool need_remote_flush(u64 old, u64 new) +{ + if (!is_shadow_present_pte(old)) + return false; + if (!is_shadow_present_pte(new)) + return true; + if ((old ^ new) & PT64_BASE_ADDR_MASK) + return true; + old ^= PT64_NX_MASK; + new ^= PT64_NX_MASK; + return (old & ~new & PT64_PERM_MASK) != 0; +} + +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +{ + if (need_remote_flush(old, new)) + kvm_flush_remote_tlbs(vcpu->kvm); + else + kvm_mmu_flush_tlb(vcpu); +} + +static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + return !!(spte && (*spte & PT_ACCESSED_MASK)); +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *sp; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 entry; + u64 *spte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + unsigned quadrant; + int level; + int flooded = 0; + int npte; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + ++vcpu->kvm->stat.mmu_pte_write; + kvm_mmu_audit(vcpu, "pre pte write"); + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { + if (sp->gfn != gfn || sp->role.metaphysical) + continue; + pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, sp->role.word); + kvm_mmu_zap_page(vcpu->kvm, sp); + ++vcpu->kvm->stat.mmu_flooded; + continue; + } + page_offset = offset; + level = sp->role.level; + npte = 1; + if (sp->role.glevels == PT32_ROOT_LEVEL) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != sp->role.quadrant) + continue; + } + spte = &sp->spt[page_offset / sizeof(*spte)]; + while (npte--) { + entry = *spte; + mmu_pte_write_zap_pte(vcpu, sp, spte); + mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, + page_offset & (pte_size - 1)); + mmu_pte_write_flush_tlb(vcpu, entry, *spte); + ++spte; + } + } + kvm_mmu_audit(vcpu, "post pte write"); +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + + return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); +} + +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { + struct kvm_mmu_page *sp; + + sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, sp); + ++vcpu->kvm->stat.mmu_recycled; + } +} + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) +{ + int r; + enum emulation_result er; + + mutex_lock(&vcpu->kvm->lock); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); + if (r < 0) + goto out; + + if (!r) { + r = 1; + goto out; + } + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + + er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); + mutex_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++vcpu->stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(vcpu, "pagetable"); + return 1; + default: + BUG(); + } +out: + mutex_unlock(&vcpu->kvm->lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + + while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { + sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, sp); + } + free_page((unsigned long)vcpu->arch.mmu.pae_root); +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct page *page; + int i; + + ASSERT(vcpu); + + if (vcpu->kvm->arch.n_requested_mmu_pages) + vcpu->kvm->arch.n_free_mmu_pages = + vcpu->kvm->arch.n_requested_mmu_pages; + else + vcpu->kvm->arch.n_free_mmu_pages = + vcpu->kvm->arch.n_alloc_mmu_pages; + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + goto error_1; + vcpu->arch.mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_create(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return alloc_mmu_pages(vcpu); +} + +int kvm_mmu_setup(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return init_kvm_mmu(vcpu); +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &sp->slot_bitmap)) + continue; + + pt = sp->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) + pt[i] &= ~PT_WRITABLE_MASK; + } +} + +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + kvm_mmu_zap_page(kvm, sp); + + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_module_exit(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL); + if (!rmap_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL); + if (!mmu_page_header_cache) + goto nomem; + + return 0; + +nomem: + kvm_mmu_module_exit(); + return -ENOMEM; +} + +/* + * Caculate mmu pages needed for kvm. + */ +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) +{ + int i; + unsigned int nr_mmu_pages; + unsigned int nr_pages = 0; + + for (i = 0; i < kvm->nmemslots; i++) + nr_pages += kvm->memslots[i].npages; + + nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; + nr_mmu_pages = max(nr_mmu_pages, + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + + return nr_mmu_pages; +} + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (ent == shadow_trap_nonpresent_pte) + continue; + + va = canonicalize(va); + if (level > 1) { + if (ent == shadow_notrap_nonpresent_pte) + printk(KERN_ERR "audit: (%s) nontrapping pte" + " in nonleaf level: levels %d gva %lx" + " level %d pte %llx\n", audit_msg, + vcpu->arch.mmu.root_level, va, level, ent); + + audit_mappings_page(vcpu, ent, va, level - 1); + } else { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + struct page *page = gpa_to_page(vcpu, gpa); + hpa_t hpa = page_to_phys(page); + + if (is_shadow_present_pte(ent) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx %d\n", + audit_msg, vcpu->arch.mmu.root_level, + va, gpa, hpa, ent, + is_shadow_present_pte(ent)); + else if (ent == shadow_notrap_nonpresent_pte + && !is_error_hpa(hpa)) + printk(KERN_ERR "audit: (%s) notrap shadow," + " valid guest gva %lx\n", audit_msg, va); + kvm_release_page_clean(page); + + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->arch.mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->arch.mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + unsigned long *rmapp = &m->rmap[j]; + + if (!*rmapp) + continue; + if (!(*rmapp & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *sp; + int i; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + u64 *pt = sp->spt; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + struct kvm_memory_slot *slot; + unsigned long *rmapp; + gfn_t gfn; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + if (sp->role.metaphysical) + continue; + + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); + gfn = unalias_gfn(vcpu->kvm, sp->gfn); + rmapp = &slot->rmap[gfn - slot->base_gfn]; + if (*rmapp) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, sp->gfn, + sp->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif -- cgit v1.2.3 From caa5b8a5ed27708521e0667649699fea202043b2 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Tue, 18 Dec 2007 06:08:27 +0800 Subject: KVM: MMU: Coalesce remote tlb flushes Host side TLB flush can be merged together if multiple spte need to be write-protected. Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 401eb7ce3207..9a57e1a01449 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -513,6 +513,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; + int write_protected = 0; gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn); @@ -522,11 +523,14 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) + if (is_writeble_pte(*spte)) { set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); - kvm_flush_remote_tlbs(kvm); + write_protected = 1; + } spte = rmap_next(kvm, rmapp, spte); } + if (write_protected) + kvm_flush_remote_tlbs(kvm); } #ifdef MMU_DEBUG -- cgit v1.2.3 From dfc5aa00cbe888d3a9ea97775bbac74cb1c1a1d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 18 Dec 2007 19:47:18 +0200 Subject: KVM: MMU: Add cache miss statistic Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/x86.c | 1 + include/asm-x86/kvm_host.h | 1 + 3 files changed, 3 insertions(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9a57e1a01449..8f12ec52ad86 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -715,6 +715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, pgprintk("%s: found\n", __FUNCTION__); return sp; } + ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return sp; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 733bff65d9a9..fa9e42c03741 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -73,6 +73,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, { "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_recycled", VM_STAT(mmu_recycled) }, + { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { NULL } }; diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 28940e1a9713..ced1bebabbc8 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -289,6 +289,7 @@ struct kvm_vm_stat { u32 mmu_pde_zapped; u32 mmu_flooded; u32 mmu_recycled; + u32 mmu_cache_miss; u32 remote_tlb_flush; }; -- cgit v1.2.3 From 10589a4699bb978c781ce73bbae8ca942c5250c9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 20 Dec 2007 19:18:22 -0500 Subject: KVM: MMU: Concurrent guest walkers Do not hold kvm->lock mutex across the entire pagefault code, only acquire it in places where it is necessary, such as mmu hash list, active list, rmap and parent pte handling. Allow concurrent guest walkers by switching walk_addr() to use mmap_sem in read-mode. And get rid of the lockless __gfn_to_page. [avi: move kvm_mmu_pte_write() locking inside the function] [avi: add locking for real mode] [avi: fix cmpxchg locking] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 41 +++++++++++++++++---- arch/x86/kvm/paging_tmpl.h | 8 +++- arch/x86/kvm/vmx.c | 25 +++++++++---- arch/x86/kvm/x86.c | 92 +++++++++++++++++++++++++++------------------- virt/kvm/kvm_main.c | 22 +++-------- 5 files changed, 117 insertions(+), 71 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f12ec52ad86..3b91227969a5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -974,7 +974,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int level = PT32E_ROOT_LEVEL; hpa_t table_addr = vcpu->arch.mmu.root_hpa; @@ -1015,6 +1015,17 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) } } +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +{ + int r; + + mutex_lock(&vcpu->kvm->lock); + r = __nonpaging_map(vcpu, v, write, gfn); + mutex_unlock(&vcpu->kvm->lock); + return r; +} + + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1031,6 +1042,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + mutex_lock(&vcpu->kvm->lock); #ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1038,6 +1050,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + mutex_unlock(&vcpu->kvm->lock); return; } #endif @@ -1051,6 +1064,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -1250,15 +1264,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - mutex_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); if (r) goto out; + mutex_lock(&vcpu->kvm->lock); mmu_alloc_roots(vcpu); + mutex_unlock(&vcpu->kvm->lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: - mutex_unlock(&vcpu->kvm->lock); return r; } EXPORT_SYMBOL_GPL(kvm_mmu_load); @@ -1353,6 +1367,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + mutex_lock(&vcpu->kvm->lock); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); if (gfn == vcpu->arch.last_pt_write_gfn @@ -1421,17 +1436,27 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } kvm_mmu_audit(vcpu, "post pte write"); + mutex_unlock(&vcpu->kvm->lock); } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa; + int r; - return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + up_read(¤t->mm->mmap_sem); + + mutex_lock(&vcpu->kvm->lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + mutex_unlock(&vcpu->kvm->lock); + return r; } void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { + mutex_lock(&vcpu->kvm->lock); while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *sp; @@ -1440,6 +1465,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } + mutex_unlock(&vcpu->kvm->lock); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -1447,7 +1473,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) int r; enum emulation_result er; - mutex_lock(&vcpu->kvm->lock); r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); if (r < 0) goto out; @@ -1462,7 +1487,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) goto out; er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); - mutex_unlock(&vcpu->kvm->lock); switch (er) { case EMULATE_DONE: @@ -1477,7 +1501,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) BUG(); } out: - mutex_unlock(&vcpu->kvm->lock); return r; } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -1574,8 +1597,10 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + mutex_lock(&kvm->lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) kvm_mmu_zap_page(kvm, sp); + mutex_unlock(&kvm->lock); kvm_flush_remote_tlbs(kvm); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 56b88f7e83ef..7f83f5557d5e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -368,11 +368,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (r) return r; + down_read(¤t->mm->mmap_sem); /* * Look up the shadow pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); + up_read(¤t->mm->mmap_sem); /* * The page is not mapped by the guest. Let the guest handle it. @@ -384,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } + mutex_lock(&vcpu->kvm->lock); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, &write_pt); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, @@ -395,11 +398,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * mmio: emulate if accessible, otherwise its a guest fault. */ - if (shadow_pte && is_io_pte(*shadow_pte)) + if (shadow_pte && is_io_pte(*shadow_pte)) { + mutex_unlock(&vcpu->kvm->lock); return 1; + } ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); + mutex_unlock(&vcpu->kvm->lock); return write_pt; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d1167fc303d6..c39493feba46 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1432,27 +1432,34 @@ static int init_rmode_tss(struct kvm *kvm) { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; u16 data = 0; + int ret = 0; int r; + down_read(¤t->mm->mmap_sem); r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) - return 0; + goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); if (r < 0) - return 0; + goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); if (r < 0) - return 0; + goto out; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) - return 0; + goto out; data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, + sizeof(u8)); if (r < 0) - return 0; - return 1; + goto out; + + ret = 1; +out: + up_read(¤t->mm->mmap_sem); + return ret; } static void seg_setup(int seg) @@ -1471,6 +1478,7 @@ static int alloc_apic_access_page(struct kvm *kvm) int r = 0; mutex_lock(&kvm->lock); + down_write(¤t->mm->mmap_sem); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -1482,6 +1490,7 @@ static int alloc_apic_access_page(struct kvm *kvm) goto out; kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: + up_write(¤t->mm->mmap_sem); mutex_unlock(&kvm->lock); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f48ec871035..e3b3141db13c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -181,7 +181,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) int ret; u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - mutex_lock(&vcpu->kvm->lock); + down_read(¤t->mm->mmap_sem); ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, offset * sizeof(u64), sizeof(pdpte)); if (ret < 0) { @@ -198,7 +198,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); out: - mutex_unlock(&vcpu->kvm->lock); + up_read(¤t->mm->mmap_sem); return ret; } @@ -212,13 +212,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; - mutex_lock(&vcpu->kvm->lock); + down_read(¤t->mm->mmap_sem); r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; out: - mutex_unlock(&vcpu->kvm->lock); + up_read(¤t->mm->mmap_sem); return changed; } @@ -278,9 +278,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); @@ -320,9 +318,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); @@ -360,7 +356,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - mutex_lock(&vcpu->kvm->lock); + down_read(¤t->mm->mmap_sem); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -376,7 +372,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; vcpu->arch.mmu.new_cr3(vcpu); } - mutex_unlock(&vcpu->kvm->lock); + up_read(¤t->mm->mmap_sem); } EXPORT_SYMBOL_GPL(set_cr3); @@ -1211,12 +1207,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(¤t->mm->mmap_sem); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - mutex_unlock(&kvm->lock); + up_write(¤t->mm->mmap_sem); return 0; } @@ -1265,7 +1261,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - mutex_lock(&kvm->lock); + down_write(¤t->mm->mmap_sem); p = &kvm->arch.aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -1279,7 +1275,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, kvm_mmu_zap_all(kvm); - mutex_unlock(&kvm->lock); + up_write(¤t->mm->mmap_sem); return 0; @@ -1355,7 +1351,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; - mutex_lock(&kvm->lock); + down_write(¤t->mm->mmap_sem); r = kvm_get_dirty_log(kvm, log, &is_dirty); if (r) @@ -1371,7 +1367,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, } r = 0; out: - mutex_unlock(&kvm->lock); + up_write(¤t->mm->mmap_sem); return r; } @@ -1565,25 +1561,32 @@ int emulator_read_std(unsigned long addr, struct kvm_vcpu *vcpu) { void *data = val; + int r = X86EMUL_CONTINUE; + down_read(¤t->mm->mmap_sem); while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); - if (ret < 0) - return X86EMUL_UNHANDLEABLE; + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } bytes -= tocopy; data += tocopy; addr += tocopy; } - - return X86EMUL_CONTINUE; +out: + up_read(¤t->mm->mmap_sem); + return r; } EXPORT_SYMBOL_GPL(emulator_read_std); @@ -1601,7 +1604,9 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } + down_read(¤t->mm->mmap_sem); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(¤t->mm->mmap_sem); /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -1617,11 +1622,14 @@ mmio: /* * Is this MMIO handled locally? */ + mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); if (mmio_dev) { kvm_iodevice_read(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -1636,10 +1644,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; + down_read(¤t->mm->mmap_sem); ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) + if (ret < 0) { + up_read(¤t->mm->mmap_sem); return 0; + } kvm_mmu_pte_write(vcpu, gpa, val, bytes); + up_read(¤t->mm->mmap_sem); return 1; } @@ -1649,7 +1661,11 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { struct kvm_io_device *mmio_dev; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa; + + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(¤t->mm->mmap_sem); if (gpa == UNMAPPED_GVA) { kvm_inject_page_fault(vcpu, addr, 2); @@ -1667,11 +1683,14 @@ mmio: /* * Is this MMIO handled locally? */ + mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); if (mmio_dev) { kvm_iodevice_write(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -1718,11 +1737,14 @@ static int emulator_cmpxchg_emulated(unsigned long addr, #ifndef CONFIG_X86_64 /* guests cmpxchg8b have to be emulated atomically */ if (bytes == 8) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa; struct page *page; char *addr; u64 val; + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -1736,8 +1758,9 @@ static int emulator_cmpxchg_emulated(unsigned long addr, set_64bit((u64 *)(addr + offset_in_page(gpa)), val); kunmap_atomic(addr, KM_USER0); kvm_release_page_dirty(page); + emul_write: + up_read(¤t->mm->mmap_sem); } -emul_write: #endif return emulator_write_emulated(addr, new, bytes, vcpu); @@ -2118,10 +2141,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - mutex_lock(&vcpu->kvm->lock); + down_read(¤t->mm->mmap_sem); page = gva_to_page(vcpu, address + i * PAGE_SIZE); vcpu->arch.pio.guest_pages[i] = page; - mutex_unlock(&vcpu->kvm->lock); + up_read(¤t->mm->mmap_sem); if (!page) { kvm_inject_gp(vcpu, 0); free_pio_guest_pages(vcpu); @@ -2247,7 +2270,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) char instruction[3]; int ret = 0; - mutex_lock(&vcpu->kvm->lock); /* * Blow out the MMU to ensure that no other VCPU has an active mapping @@ -2262,8 +2284,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) != X86EMUL_CONTINUE) ret = -EFAULT; - mutex_unlock(&vcpu->kvm->lock); - return ret; } @@ -2447,8 +2467,10 @@ static void vapic_enter(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; + down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); vcpu->arch.apic->vapic_page = page; + up_read(¤t->mm->mmap_sem); } static void vapic_exit(struct kvm_vcpu *vcpu) @@ -2910,13 +2932,13 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; vcpu_load(vcpu); - mutex_lock(&vcpu->kvm->lock); + down_read(¤t->mm->mmap_sem); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + up_read(¤t->mm->mmap_sem); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - mutex_unlock(&vcpu->kvm->lock); vcpu_put(vcpu); return 0; @@ -3185,13 +3207,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { - down_write(¤t->mm->mmap_sem); memslot->userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); - up_write(¤t->mm->mmap_sem); if (IS_ERR((void *)memslot->userspace_addr)) return PTR_ERR((void *)memslot->userspace_addr); @@ -3199,10 +3219,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (!old.user_alloc && old.rmap) { int ret; - down_write(¤t->mm->mmap_sem); ret = do_munmap(current->mm, old.userspace_addr, old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4026d7d64296..678e80561b74 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -227,7 +227,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) * * Discontiguous memory is allowed, mostly for framebuffers. * - * Must be called holding kvm->lock. + * Must be called holding mmap_sem for write. */ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, @@ -338,9 +338,9 @@ int kvm_set_memory_region(struct kvm *kvm, { int r; - mutex_lock(&kvm->lock); + down_write(¤t->mm->mmap_sem); r = __kvm_set_memory_region(kvm, mem, user_alloc); - mutex_unlock(&kvm->lock); + up_write(¤t->mm->mmap_sem); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); @@ -456,7 +456,7 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) /* * Requires current->mm->mmap_sem to be held */ -static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn) +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { struct page *page[1]; unsigned long addr; @@ -481,17 +481,6 @@ static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn) return page[0]; } -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct page *page; - - down_read(¤t->mm->mmap_sem); - page = __gfn_to_page(kvm, gfn); - up_read(¤t->mm->mmap_sem); - - return page; -} - EXPORT_SYMBOL_GPL(gfn_to_page); void kvm_release_page_clean(struct page *page) @@ -977,8 +966,7 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) return VM_FAULT_SIGBUS; - /* current->mm->mmap_sem is already held so call lockless version */ - page = __gfn_to_page(kvm, vmf->pgoff); + page = gfn_to_page(kvm, vmf->pgoff); if (is_error_page(page)) { kvm_release_page_clean(page); return VM_FAULT_SIGBUS; -- cgit v1.2.3 From d7824fff896a1698a07a8046dc362f4500c302f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 30 Dec 2007 12:29:05 +0200 Subject: KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte() Since gfn_to_page() is a sleeping function, and we want to make the core mmu spinlocked, we need to pass the page from the walker context (which can sleep) to the shadow context (which cannot). [marcelo: avoid recursive locking of mmap_sem] Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 55 +++++++++++++++++++++++++++++++++++++++++----- arch/x86/kvm/paging_tmpl.h | 23 ++++++++++++++----- include/asm-x86/kvm_host.h | 5 +++++ 3 files changed, 73 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b91227969a5..c0b757be7b99 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -890,11 +890,10 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, gfn_t gfn) + int *ptwrite, gfn_t gfn, struct page *page) { u64 spte; int was_rmapped = is_rmap_pte(*shadow_pte); - struct page *page; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -912,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (!(pte_access & ACC_EXEC_MASK)) spte |= PT64_NX_MASK; - page = gfn_to_page(vcpu->kvm, gfn); - spte |= PT_PRESENT_MASK; if (pte_access & ACC_USER_MASK) spte |= PT_USER_MASK; @@ -979,6 +976,11 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int level = PT32E_ROOT_LEVEL; hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; + struct page *page; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gfn); + up_read(¤t->mm->mmap_sem); for (; ; level--) { u32 index = PT64_INDEX(v, level); @@ -989,7 +991,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (level == 1) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, gfn); + 0, write, 1, &pt_write, gfn, page); return pt_write || is_io_pte(table[index]); } @@ -1005,6 +1007,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) NULL); if (!new_table) { pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_page_clean(page); return -ENOMEM; } @@ -1347,6 +1350,43 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & PT_ACCESSED_MASK)); } +static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn; + int r; + u64 gpte = 0; + + if (bytes != 4 && bytes != 8) + return; + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if (is_pae(vcpu)) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if ((bytes == 4) && (gpa % 4 == 0)) { + r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); + if (r) + return; + memcpy((void *)&gpte + (gpa % 8), new, 4); + } else if ((bytes == 8) && (gpa % 8 == 0)) { + memcpy((void *)&gpte, new, 8); + } + } else { + if ((bytes == 4) && (gpa % 4 == 0)) + memcpy((void *)&gpte, new, 4); + } + if (!is_present_pte(gpte)) + return; + gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + vcpu->arch.update_pte.gfn = gfn; + vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { @@ -1367,6 +1407,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); mutex_lock(&vcpu->kvm->lock); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); @@ -1437,6 +1478,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } kvm_mmu_audit(vcpu, "post pte write"); mutex_unlock(&vcpu->kvm->lock); + if (vcpu->arch.update_pte.page) { + kvm_release_page_clean(vcpu->arch.update_pte.page); + vcpu->arch.update_pte.page = NULL; + } } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 136a65d72b0a..3d7846ba26e1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -245,6 +245,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, { pt_element_t gpte; unsigned pte_access; + struct page *npage; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { @@ -256,8 +257,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + return; + npage = vcpu->arch.update_pte.page; + if (!npage) + return; + get_page(npage); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte)); + gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); } /* @@ -265,7 +272,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *walker, - int user_fault, int write_fault, int *ptwrite) + int user_fault, int write_fault, int *ptwrite, + struct page *page) { hpa_t shadow_addr; int level; @@ -321,8 +329,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, r = kvm_read_guest_atomic(vcpu->kvm, walker->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) + if (r || curr_pte != walker->ptes[level - 2]) { + kvm_release_page_clean(page); return NULL; + } } shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK @@ -333,7 +343,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, user_fault, write_fault, walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, walker->gfn); + ptwrite, walker->gfn, page); return shadow_ent; } @@ -362,6 +372,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u64 *shadow_pte; int write_pt = 0; int r; + struct page *page; pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -388,9 +399,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } + page = gfn_to_page(vcpu->kvm, walker.gfn); + mutex_lock(&vcpu->kvm->lock); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - &write_pt); + &write_pt, page); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, shadow_pte, *shadow_pte, write_pt); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 44b89259f6c4..20597bc16744 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -224,6 +224,11 @@ struct kvm_vcpu_arch { int last_pt_write_count; u64 *last_pte_updated; + struct { + gfn_t gfn; /* presumed gfn during guest pte update */ + struct page *page; /* page corresponding to that gfn */ + } update_pte; + struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; -- cgit v1.2.3 From aaee2c94f7a1f7726e360a6cfb40173bd552bcff Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 20 Dec 2007 19:18:26 -0500 Subject: KVM: MMU: Switch to mmu spinlock Convert the synchronization of the shadow handling to a separate mmu_lock spinlock. Also guard fetch() by mmap_sem in read-mode to protect against alias and memslot changes. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 48 +++++++++++++++++++++++++--------------------- arch/x86/kvm/paging_tmpl.h | 10 ++++++---- arch/x86/kvm/vmx.c | 2 -- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 3 +-- 5 files changed, 35 insertions(+), 31 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c0b757be7b99..834698d24595 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -971,16 +971,12 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, + gfn_t gfn, struct page *page) { int level = PT32E_ROOT_LEVEL; hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; - struct page *page; - - down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); for (; ; level--) { u32 index = PT64_INDEX(v, level); @@ -1022,9 +1018,17 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; - mutex_lock(&vcpu->kvm->lock); - r = __nonpaging_map(vcpu, v, write, gfn); - mutex_unlock(&vcpu->kvm->lock); + struct page *page; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gfn); + + spin_lock(&vcpu->kvm->mmu_lock); + r = __nonpaging_map(vcpu, v, write, gfn, page); + spin_unlock(&vcpu->kvm->mmu_lock); + + up_read(¤t->mm->mmap_sem); + return r; } @@ -1045,7 +1049,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); #ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1053,7 +1057,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); return; } #endif @@ -1067,7 +1071,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -1270,9 +1274,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); mmu_alloc_roots(vcpu); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: @@ -1408,7 +1412,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); if (gfn == vcpu->arch.last_pt_write_gfn @@ -1477,7 +1481,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } kvm_mmu_audit(vcpu, "post pte write"); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); if (vcpu->arch.update_pte.page) { kvm_release_page_clean(vcpu->arch.update_pte.page); vcpu->arch.update_pte.page = NULL; @@ -1493,15 +1497,15 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); up_read(¤t->mm->mmap_sem); - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); return r; } void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *sp; @@ -1510,7 +1514,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -1642,10 +1646,10 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - mutex_lock(&kvm->lock); + spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) kvm_mmu_zap_page(kvm, sp); - mutex_unlock(&kvm->lock); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 3d7846ba26e1..a35b83a4fef2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -387,7 +387,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); - up_read(¤t->mm->mmap_sem); /* * The page is not mapped by the guest. Let the guest handle it. @@ -396,12 +395,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + up_read(¤t->mm->mmap_sem); return 0; } page = gfn_to_page(vcpu->kvm, walker.gfn); - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, &write_pt, page); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, @@ -414,13 +414,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, * mmio: emulate if accessible, otherwise its a guest fault. */ if (shadow_pte && is_io_pte(*shadow_pte)) { - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); return 1; } ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); return write_pt; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c39493feba46..3d251f894a8d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1477,7 +1477,6 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(&kvm->lock); down_write(¤t->mm->mmap_sem); if (kvm->arch.apic_access_page) goto out; @@ -1491,7 +1490,6 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: up_write(¤t->mm->mmap_sem); - mutex_unlock(&kvm->lock); return r; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a020fb280540..2714068ee8bc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -104,7 +104,8 @@ struct kvm_memory_slot { }; struct kvm { - struct mutex lock; /* protects everything except vcpus */ + struct mutex lock; /* protects the vcpus array and APIC accesses */ + spinlock_t mmu_lock; struct mm_struct *mm; /* userspace tied to this vm */ int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8d0b7c16c2f7..3c4fe26096fc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); + spin_lock_init(&kvm->mmu_lock); kvm_io_bus_init(&kvm->pio_bus); mutex_init(&kvm->lock); kvm_io_bus_init(&kvm->mmio_bus); @@ -552,9 +553,7 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; - pagefault_disable(); r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); - pagefault_enable(); if (r) return -EFAULT; return 0; -- cgit v1.2.3 From eb787d10af8045dd00d4d4c9a8e90fa495f1b0c1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 31 Dec 2007 15:27:49 +0200 Subject: KVM: MMU: Move kvm_free_some_pages() into critical section If some other cpu steals mmu pages between our check and an attempt to allocate, we can run out of mmu pages. Fix by moving the check into the same critical section as the allocation. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++------ arch/x86/kvm/paging_tmpl.h | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 834698d24595..c478ee25de66 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -291,7 +291,6 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) { int r; - kvm_mmu_free_some_pages(vcpu); r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache, 4); if (r) @@ -569,9 +568,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, { struct kvm_mmu_page *sp; - if (!vcpu->kvm->arch.n_free_mmu_pages) - return NULL; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); @@ -1024,6 +1020,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) page = gfn_to_page(vcpu->kvm, gfn); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); r = __nonpaging_map(vcpu, v, write, gfn, page); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1275,6 +1272,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); @@ -1413,6 +1411,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); if (gfn == vcpu->arch.last_pt_write_gfn @@ -1505,7 +1504,6 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - spin_lock(&vcpu->kvm->mmu_lock); while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *sp; @@ -1514,7 +1512,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } - spin_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a35b83a4fef2..349920556be3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -402,6 +402,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, page = gfn_to_page(vcpu->kvm, walker.gfn); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, &write_pt, page); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, -- cgit v1.2.3 From 75e68e607896c84310dee37c783c45220e56ce8c Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Sat, 12 Jan 2008 23:49:09 +0200 Subject: KVM: MMU: Fix dirty page setting for pages removed from rmap Right now rmap_remove won't set the page as dirty if the shadow pte pointed to this page had write access and then it became readonly. This patches fixes that, by setting the page as dirty for spte changes from write to readonly access. Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c478ee25de66..8efdcdbebb03 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -890,6 +890,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, { u64 spte; int was_rmapped = is_rmap_pte(*shadow_pte); + int was_writeble = is_writeble_pte(*shadow_pte); pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -956,9 +957,12 @@ unshadowed: rmap_add(vcpu, shadow_pte, gfn); if (!is_rmap_pte(*shadow_pte)) kvm_release_page_clean(page); + } else { + if (was_writeble) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); } - else - kvm_release_page_clean(page); if (!ptwrite || !*ptwrite) vcpu->arch.last_pte_updated = shadow_pte; } -- cgit v1.2.3