From e925c5ba9380dad5fdf1d0a9d9199ac43be74c6a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 14:47:02 +0300 Subject: KVM: Assume that writes smaller than 4 bytes are to non-pagetable pages This allows us to remove write protection earlier than otherwise. Should some mad OS choose to use byte writes to update pagetables, it will suffer a performance hit, but still work correctly. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index e8e228118de9..2277b7cd118c 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1169,6 +1169,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) continue; pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; if (misaligned || flooded) { /* * Misaligned accesses are too much trouble to fix -- cgit v1.2.3 From 09072daf37abbfe8e2d5018dd913f229c76190f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 14:16:52 +0300 Subject: KVM: Unify kvm_mmu_pre_write() and kvm_mmu_post_write() Instead of calling two functions and repeating expensive checks, call one function and provide it with before/after information. 
Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 ++-- drivers/kvm/kvm_main.c | 4 ++-- drivers/kvm/mmu.c | 11 ++++------- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 7facebd1911d..11c519e8085a 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -525,8 +525,8 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, unsigned long segment_base(u16 selector); -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7d682586423b..b6ad9c6f2efe 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1071,18 +1071,18 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { struct page *page; void *virt; + unsigned offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); if (!page) return 0; - kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); - kvm_mmu_post_write(vcpu, gpa, bytes); return 1; } diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2277b7cd118c..b3a83ef2cf07 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1118,7 +1118,7 @@ out: return r; } -static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, u64 *spte) { @@ -1137,7 +1137,8 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu 
*vcpu, *spte = 0; } -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; @@ -1206,16 +1207,12 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); while (npte--) { - mmu_pre_write_zap_pte(vcpu, page, spte); + mmu_pte_write_zap_pte(vcpu, page, spte); ++spte; } } } -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) -{ -} - int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); -- cgit v1.2.3 From fce0657ff9f14f6b1f147b5fcd6db2f54c06424e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 16:44:05 +0300 Subject: KVM: MMU: Respect nonpae pagetable quadrant when zapping ptes When a guest writes to a page that has an mmu shadow, we have to clear the shadow pte corresponding to the memory location touched by the guest. Now, in nonpae mode, a single guest page may have two or four shadow pages (because a nonpae page maps 4MB or 4GB, whereas the pae shadow maps 2MB or 1GB), so when we look up the page we find up to three additional aliases for the page. Since we _clear_ the shadow pte, it doesn't matter except for a slight performance penalty, but if we want to _update_ the shadow pte instead of clearing it, it is vital that we don't modify the aliases. Fortunately, exactly which page is needed (the "quadrant") is easily computed, and is accessible in the shadow page header. All we need is to ignore shadow pages from the wrong quadrants. 
Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index b3a83ef2cf07..23dc4612026b 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1150,6 +1150,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned pte_size; unsigned page_offset; unsigned misaligned; + unsigned quadrant; int level; int flooded = 0; int npte; @@ -1202,7 +1203,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, page_offset <<= 1; npte = 2; } + quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; + if (quadrant != page->role.quadrant) + continue; } spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); -- cgit v1.2.3 From 0028425f647b6b78a0de8810d6b782fc3ce6c272 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 16:53:31 +0300 Subject: KVM: Update shadow pte on write to guest pte A typical demand page/copy on write pattern is: - page fault on vaddr - kvm propagates fault to guest - guest handles fault, updates pte - kvm traps write, clears shadow pte, resumes guest - guest returns to userspace, re-faults on same vaddr - kvm installs shadow pte, resumes guest - guest continues So, three vmexits for a single guest page fault. But if instead of clearing the page table entry, we update it to correspond to the value that the guest has just written, we eliminate the third vmexit. This patch does exactly that, reducing kbuild time by about 10%. 
Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 15 +++++++++++++++ drivers/kvm/paging_tmpl.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 23dc4612026b..9ec3df90dbb8 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1137,6 +1137,20 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, *spte = 0; } +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte, + const void *new, int bytes) +{ + if (page->role.level != PT_PAGE_TABLE_LEVEL) + return; + + if (page->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, page, spte, new, bytes); + else + paging64_update_pte(vcpu, page, spte, new, bytes); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *old, const u8 *new, int bytes) { @@ -1212,6 +1226,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, spte += page_offset / sizeof(*spte); while (npte--) { mmu_pte_write_zap_pte(vcpu, page, spte); + mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); ++spte; } } diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index bc64cceec039..10ba0a80ce59 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -202,6 +202,21 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, guest_pte & PT_DIRTY_MASK, access_bits, gfn); } +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes) +{ + pt_element_t gpte; + + if (bytes < sizeof(pt_element_t)) + return; + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + FNAME(set_pte)(vcpu, gpte, spte, 6, + (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); +} + static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { -- cgit v1.2.3 From 
4b02d6daa12465b209ec4f50c363f9553a51f45b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 15:36:30 +0300 Subject: KVM: MMU: Simplify kvm_mmu_free_page() a tiny bit Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 9ec3df90dbb8..a96c9ae54f3c 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -455,12 +455,10 @@ static int is_empty_shadow_page(hpa_t page_hpa) } #endif -static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) +static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page_head) { - struct kvm_mmu_page *page_head = page_header(page_hpa); - - ASSERT(is_empty_shadow_page(page_hpa)); - page_head->page_hpa = page_hpa; + ASSERT(is_empty_shadow_page(page_head->page_hpa)); list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -690,7 +688,7 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, kvm_mmu_page_unlink_children(vcpu, page); if (!page->root_count) { hlist_del(&page->hash_link); - kvm_mmu_free_page(vcpu, page->page_hpa); + kvm_mmu_free_page(vcpu, page); } else list_move(&page->link, &vcpu->kvm->active_mmu_pages); } -- cgit v1.2.3 From 47ad8e689b4f94f9fc3b2588a7aaa65e4eca667c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 15:50:58 +0300 Subject: KVM: MMU: Store shadow page tables as kernel virtual addresses, not physical Simplifies things a bit. 
Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- drivers/kvm/mmu.c | 32 +++++++++++++++----------------- drivers/kvm/paging_tmpl.h | 2 +- 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 5e6dac5a3c00..fc4a6c1235f0 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -139,7 +139,7 @@ struct kvm_mmu_page { gfn_t gfn; union kvm_mmu_page_role role; - hpa_t page_hpa; + u64 *spt; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index a96c9ae54f3c..c85c6649280e 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -439,13 +439,12 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } #ifdef MMU_DEBUG -static int is_empty_shadow_page(hpa_t page_hpa) +static int is_empty_shadow_page(u64 *spt) { u64 *pos; u64 *end; - for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); - pos != end; pos++) + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) if (*pos != 0) { printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, pos, *pos); @@ -458,7 +457,7 @@ static int is_empty_shadow_page(hpa_t page_hpa) static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page_head) { - ASSERT(is_empty_shadow_page(page_head->page_hpa)); + ASSERT(is_empty_shadow_page(page_head->spt)); list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -478,7 +477,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_move(&page->link, &vcpu->kvm->active_mmu_pages); - ASSERT(is_empty_shadow_page(page->page_hpa)); + ASSERT(is_empty_shadow_page(page->spt)); page->slot_bitmap = 0; page->multimapped = 0; page->parent_pte = parent_pte; @@ -636,7 +635,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, u64 *pt; u64 ent; - pt = 
__va(page->page_hpa); + pt = page->spt; if (page->role.level == PT_PAGE_TABLE_LEVEL) { for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -803,7 +802,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) return -ENOMEM; } - table[index] = new_table->page_hpa | PT_PRESENT_MASK + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; } table_addr = table[index] & PT64_BASE_ADDR_MASK; @@ -855,7 +854,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); page = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.root_hpa = root; return; @@ -876,7 +875,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; } @@ -1220,8 +1219,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != page->role.quadrant) continue; } - spte = __va(page->page_hpa); - spte += page_offset / sizeof(*spte); + spte = &page->spt[page_offset / sizeof(*spte)]; while (npte--) { mmu_pte_write_zap_pte(vcpu, page, spte); mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); @@ -1262,8 +1260,8 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_del(&page->link); - __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); - page->page_hpa = INVALID_PAGE; + free_page((unsigned long)page->spt); + page->spt = NULL; } free_page((unsigned long)vcpu->mmu.pae_root); } @@ -1282,8 +1280,8 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if ((page = alloc_page(GFP_KERNEL)) == NULL) goto error_1; set_page_private(page, (unsigned long)page_header); - page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; - 
memset(__va(page_header->page_hpa), 0, PAGE_SIZE); + page_header->spt = page_address(page); + memset(page_header->spt, 0, PAGE_SIZE); list_add(&page_header->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -1346,7 +1344,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) if (!test_bit(slot, &page->slot_bitmap)) continue; - pt = __va(page->page_hpa); + pt = page->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ if (pt[i] & PT_WRITABLE_MASK) { @@ -1497,7 +1495,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) int i; list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { - u64 *pt = __va(page->page_hpa); + u64 *pt = page->spt; if (page->role.level != PT_PAGE_TABLE_LEVEL) continue; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 10ba0a80ce59..6dd0da9a5d15 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -304,7 +304,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, metaphysical, hugepage_access, shadow_ent); - shadow_addr = shadow_page->page_hpa; + shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *shadow_ent = shadow_pte; -- cgit v1.2.3 From d3d25b048b9c7e5c1c20918157a71df734f71766 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 May 2007 12:34:53 +0300 Subject: KVM: MMU: Use slab caches for shadow pages and their headers Use slab caches instead of a simple custom list. 
Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 ++-- drivers/kvm/kvm_main.c | 1 - drivers/kvm/mmu.c | 64 ++++++++++++++++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 28 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 90001b5a0253..199e1e9bae25 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -299,12 +299,12 @@ struct kvm_vcpu { struct vmx_msr_entry *guest_msrs; struct vmx_msr_entry *host_msrs; - struct list_head free_pages; - struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; struct kvm_mmu mmu; struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_page_header_cache; gfn_t last_pt_write_gfn; int last_pt_write_count; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index af07cd539bba..bf35457ce377 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -326,7 +326,6 @@ static struct kvm *kvm_create_vm(void) vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; - INIT_LIST_HEAD(&vcpu->free_pages); spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index c85c6649280e..46491b4cd859 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -165,6 +165,8 @@ struct kvm_rmap_desc { static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_cache; +static struct kmem_cache *mmu_page_header_cache; static int is_write_protection(struct kvm_vcpu *vcpu) { @@ -235,6 +237,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, rmap_desc_cache, 1, gfp_flags); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_cache, + mmu_page_cache, 4, gfp_flags); + if (r) + goto out; + r = 
mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, + mmu_page_header_cache, 4, gfp_flags); out: return r; } @@ -258,6 +268,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->mmu_page_cache); + mmu_free_memory_cache(&vcpu->mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -458,7 +470,9 @@ static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page_head) { ASSERT(is_empty_shadow_page(page_head->spt)); - list_move(&page_head->link, &vcpu->free_pages); + list_del(&page_head->link); + mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt); + mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head); ++vcpu->kvm->n_free_mmu_pages; } @@ -472,11 +486,14 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, { struct kvm_mmu_page *page; - if (list_empty(&vcpu->free_pages)) + if (!vcpu->kvm->n_free_mmu_pages) return NULL; - page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); - list_move(&page->link, &vcpu->kvm->active_mmu_pages); + page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, + sizeof *page); + page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(page->spt), (unsigned long)page); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); ASSERT(is_empty_shadow_page(page->spt)); page->slot_bitmap = 0; page->multimapped = 0; @@ -1083,6 +1100,7 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + mmu_topup_memory_caches(vcpu); if (!is_paging(vcpu)) return nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) @@ -1256,13 +1274,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) struct kvm_mmu_page, link); kvm_mmu_zap_page(vcpu, page); } - while (!list_empty(&vcpu->free_pages)) { - page = 
list_entry(vcpu->free_pages.next, - struct kvm_mmu_page, link); - list_del(&page->link); - free_page((unsigned long)page->spt); - page->spt = NULL; - } free_page((unsigned long)vcpu->mmu.pae_root); } @@ -1273,18 +1284,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { - struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; - - INIT_LIST_HEAD(&page_header->link); - if ((page = alloc_page(GFP_KERNEL)) == NULL) - goto error_1; - set_page_private(page, (unsigned long)page_header); - page_header->spt = page_address(page); - memset(page_header->spt, 0, PAGE_SIZE); - list_add(&page_header->link, &vcpu->free_pages); - ++vcpu->kvm->n_free_mmu_pages; - } + vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. @@ -1309,7 +1309,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - ASSERT(list_empty(&vcpu->free_pages)); return alloc_mmu_pages(vcpu); } @@ -1318,7 +1317,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - ASSERT(!list_empty(&vcpu->free_pages)); return init_kvm_mmu(vcpu); } @@ -1377,6 +1375,10 @@ void kvm_mmu_module_exit(void) kmem_cache_destroy(pte_chain_cache); if (rmap_desc_cache) kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_cache) + kmem_cache_destroy(mmu_page_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) @@ -1392,6 +1394,18 @@ int kvm_mmu_module_init(void) if (!rmap_desc_cache) goto nomem; + mmu_page_cache = kmem_cache_create("kvm_mmu_page", + PAGE_SIZE, + PAGE_SIZE, 0, NULL, NULL); + if (!mmu_page_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL, NULL); + if (!mmu_page_header_cache) + goto nomem; + return 0; nomem: -- cgit v1.2.3 From 
e60d75ea292071e7ab33c10ca73fdd33fcbbe501 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 May 2007 19:31:17 +0300 Subject: KVM: MMU: Move set_pte_common() to pte width dependent code In preparation of some modifications. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 48 ---------------------------------------- drivers/kvm/paging_tmpl.h | 56 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 52 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 46491b4cd859..a7631502f22b 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -965,54 +965,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } -static inline void set_pte_common(struct kvm_vcpu *vcpu, - u64 *shadow_pte, - gpa_t gaddr, - int dirty, - u64 access_bits, - gfn_t gfn) -{ - hpa_t paddr; - - *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; - if (!dirty) - access_bits &= ~PT_WRITABLE_MASK; - - paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - - *shadow_pte |= access_bits; - - if (is_error_hpa(paddr)) { - *shadow_pte |= gaddr; - *shadow_pte |= PT_SHADOW_IO_MARK; - *shadow_pte &= ~PT_PRESENT_MASK; - return; - } - - *shadow_pte |= paddr; - - if (access_bits & PT_WRITABLE_MASK) { - struct kvm_mmu_page *shadow; - - shadow = kvm_mmu_lookup_page(vcpu, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - access_bits &= ~PT_WRITABLE_MASK; - if (is_writeble_pte(*shadow_pte)) { - *shadow_pte &= ~PT_WRITABLE_MASK; - kvm_arch_ops->tlb_flush(vcpu); - } - } - } - - if (access_bits & PT_WRITABLE_MASK) - mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); - - page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - rmap_add(vcpu, shadow_pte); -} - static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index e094a8ba17a8..65763007f04d 
100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -192,14 +192,62 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); } +static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + int dirty, + u64 access_bits, + gfn_t gfn) +{ + hpa_t paddr; + + *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + *shadow_pte |= access_bits; + + if (is_error_hpa(paddr)) { + *shadow_pte |= gaddr; + *shadow_pte |= PT_SHADOW_IO_MARK; + *shadow_pte &= ~PT_PRESENT_MASK; + return; + } + + *shadow_pte |= paddr; + + if (access_bits & PT_WRITABLE_MASK) { + struct kvm_mmu_page *shadow; + + shadow = kvm_mmu_lookup_page(vcpu, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + access_bits &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(*shadow_pte)) { + *shadow_pte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); + } + } + } + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + rmap_add(vcpu, shadow_pte); +} + static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { ASSERT(*shadow_pte == 0); access_bits &= guest_pte; *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); - set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, - guest_pte & PT_DIRTY_MASK, access_bits, gfn); + FNAME(set_pte_common)(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, + guest_pte & PT_DIRTY_MASK, access_bits, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -229,8 +277,8 @@ static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); *shadow_pte = guest_pde & 
PT_PTE_COPY_MASK; - set_pte_common(vcpu, shadow_pte, gaddr, - guest_pde & PT_DIRTY_MASK, access_bits, gfn); + FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, + guest_pde & PT_DIRTY_MASK, access_bits, gfn); } /* -- cgit v1.2.3 From 63b1ad24d2695db3ec1cc8b10760e130e1a1f04b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 11:56:54 +0300 Subject: KVM: MMU: Fold fix_read_pf() into set_pte_common() Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 17 ----------------- drivers/kvm/paging_tmpl.h | 34 +++++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 28 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index a7631502f22b..2079d69f186a 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -972,23 +972,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); } -static inline int fix_read_pf(u64 *shadow_ent) -{ - if ((*shadow_ent & PT_SHADOW_USER_MASK) && - !(*shadow_ent & PT_USER_MASK)) { - /* - * If supervisor write protect is disabled, we shadow kernel - * pages as user pages so we can trap the write access. - */ - *shadow_ent |= PT_USER_MASK; - *shadow_ent &= ~PT_WRITABLE_MASK; - - return 1; - - } - return 0; -} - static void paging_free(struct kvm_vcpu *vcpu) { nonpaging_free(vcpu); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 7e998d193849..869582befaf1 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -197,6 +197,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, gpa_t gaddr, pt_element_t *gpte, u64 access_bits, + int write_fault, gfn_t gfn) { hpa_t paddr; @@ -219,6 +220,17 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, *shadow_pte |= paddr; + if (!write_fault && (*shadow_pte & PT_SHADOW_USER_MASK) && + !(*shadow_pte & PT_USER_MASK)) { + /* + * If supervisor write protect is disabled, we shadow kernel + * pages as user pages so we can trap the write access. 
+ */ + *shadow_pte |= PT_USER_MASK; + *shadow_pte &= ~PT_WRITABLE_MASK; + access_bits &= ~PT_WRITABLE_MASK; + } + if (access_bits & PT_WRITABLE_MASK) { struct kvm_mmu_page *shadow; @@ -242,13 +254,14 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, } static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, - u64 *shadow_pte, u64 access_bits, gfn_t gfn) + u64 *shadow_pte, u64 access_bits, + int write_fault, gfn_t gfn) { ASSERT(*shadow_pte == 0); access_bits &= *gpte; *shadow_pte = (*gpte & PT_PTE_COPY_MASK); FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, - gpte, access_bits, gfn); + gpte, access_bits, write_fault, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -262,12 +275,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, + FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, - u64 *shadow_pte, u64 access_bits, gfn_t gfn) + u64 *shadow_pte, u64 access_bits, int write_fault, + gfn_t gfn) { gpa_t gaddr; @@ -279,14 +293,14 @@ static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, (32 - PT32_DIR_PSE36_SHIFT); *shadow_pte = *gpde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, - gpde, access_bits, gfn); + gpde, access_bits, write_fault, gfn); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. 
*/ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker) + struct guest_walker *walker, int write_fault) { hpa_t shadow_addr; int level; @@ -351,12 +365,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (prev_shadow_ent) *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, guest_ent, shadow_ent, - walker->inherited_ar, walker->gfn); + walker->inherited_ar, write_fault, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); FNAME(set_pte)(vcpu, guest_ent, shadow_ent, walker->inherited_ar, - walker->gfn); + write_fault, walker->gfn); } return shadow_ent; } @@ -489,7 +503,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - shadow_pte = FNAME(fetch)(vcpu, addr, &walker); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, write_fault); pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, shadow_pte, *shadow_pte); @@ -499,8 +513,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (write_fault) fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, user_fault, &write_pt); - else - fixed = fix_read_pf(shadow_pte); pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, shadow_pte, *shadow_pte); -- cgit v1.2.3 From 97a0a01ea9229e4f3f0f06e0584227e9687159a5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 15:08:29 +0300 Subject: KVM: MMU: Fold fix_write_pf() into set_pte_common() This prevents some work from being performed twice, and, more importantly, reduces the number of places where we modify shadow ptes. 
Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 11 +++ drivers/kvm/paging_tmpl.h | 168 ++++++++++++++++------------------------------ 2 files changed, 68 insertions(+), 111 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2079d69f186a..3cdbf687df25 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -731,6 +731,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) return r; } +static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *page; + + while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", + __FUNCTION__, gfn, page->role.word); + kvm_mmu_zap_page(vcpu, page); + } +} + static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) { int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 869582befaf1..c06720385551 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -197,11 +197,26 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, gpa_t gaddr, pt_element_t *gpte, u64 access_bits, + int user_fault, int write_fault, + int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { hpa_t paddr; int dirty = *gpte & PT_DIRTY_MASK; + int was_rmapped = is_rmap_pte(*shadow_pte); + + pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, *shadow_pte, (u64)*gpte, access_bits, + write_fault, user_fault, gfn); + + if (write_fault && !dirty) { + *gpte |= PT_DIRTY_MASK; + dirty = 1; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); + } *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) @@ -209,7 +224,9 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - *shadow_pte |= access_bits; + *shadow_pte |= PT_PRESENT_MASK; + if (access_bits & PT_USER_MASK) + *shadow_pte |= PT_USER_MASK; if 
(is_error_hpa(paddr)) { *shadow_pte |= gaddr; @@ -231,37 +248,50 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, access_bits &= ~PT_WRITABLE_MASK; } - if (access_bits & PT_WRITABLE_MASK) { + if ((access_bits & PT_WRITABLE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { struct kvm_mmu_page *shadow; + *shadow_pte |= PT_WRITABLE_MASK; + if (user_fault) { + mmu_unshadow(vcpu, gfn); + goto unshadowed; + } + shadow = kvm_mmu_lookup_page(vcpu, gfn); if (shadow) { pgprintk("%s: found shadow page for %lx, marking ro\n", __FUNCTION__, gfn); access_bits &= ~PT_WRITABLE_MASK; if (is_writeble_pte(*shadow_pte)) { - *shadow_pte &= ~PT_WRITABLE_MASK; - kvm_arch_ops->tlb_flush(vcpu); + *shadow_pte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); } + if (write_fault) + *ptwrite = 1; } } +unshadowed: + if (access_bits & PT_WRITABLE_MASK) mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - rmap_add(vcpu, shadow_pte); + if (!was_rmapped) + rmap_add(vcpu, shadow_pte); } static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, u64 *shadow_pte, u64 access_bits, - int write_fault, gfn_t gfn) + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { - ASSERT(*shadow_pte == 0); access_bits &= *gpte; - *shadow_pte = (*gpte & PT_PTE_COPY_MASK); + *shadow_pte |= (*gpte & PT_PTE_COPY_MASK); FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, - gpte, access_bits, write_fault, gfn); + gpte, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -276,31 +306,34 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + 0, NULL, NULL, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } 
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, - u64 *shadow_pte, u64 access_bits, int write_fault, - gfn_t gfn) + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { gpa_t gaddr; - ASSERT(*shadow_pte == 0); access_bits &= *gpde; gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); - *shadow_pte = *gpde & PT_PTE_COPY_MASK; + *shadow_pte |= *gpde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, - gpde, access_bits, write_fault, gfn); + gpde, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, int write_fault) + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite) { hpa_t shadow_addr; int level; @@ -330,7 +363,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) - return shadow_ent; + break; shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; prev_shadow_ent = shadow_ent; continue; @@ -365,94 +398,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (prev_shadow_ent) *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, guest_ent, shadow_ent, - walker->inherited_ar, write_fault, walker->gfn); + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); FNAME(set_pte)(vcpu, guest_ent, shadow_ent, - walker->inherited_ar, - write_fault, walker->gfn); + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); } return shadow_ent; } -/* - * The guest faulted for write. 
We need to - * - * - check write permissions - * - update the guest pte dirty bit - * - update our own dirty page tracking structures - */ -static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, - u64 *shadow_ent, - struct guest_walker *walker, - gva_t addr, - int user, - int *write_pt) -{ - pt_element_t *guest_ent; - int writable_shadow; - gfn_t gfn; - struct kvm_mmu_page *page; - - if (is_writeble_pte(*shadow_ent)) - return !user || (*shadow_ent & PT_USER_MASK); - - writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; - if (user) { - /* - * User mode access. Fail if it's a kernel page or a read-only - * page. - */ - if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow) - return 0; - ASSERT(*shadow_ent & PT_USER_MASK); - } else - /* - * Kernel mode access. Fail if it's a read-only page and - * supervisor write protection is enabled. - */ - if (!writable_shadow) { - if (is_write_protection(vcpu)) - return 0; - *shadow_ent &= ~PT_USER_MASK; - } - - guest_ent = walker->ptep; - - if (!is_present_pte(*guest_ent)) { - *shadow_ent = 0; - return 0; - } - - gfn = walker->gfn; - - if (user) { - /* - * Usermode page faults won't be for page table updates. - */ - while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", - __FUNCTION__, gfn, page->role.word); - kvm_mmu_zap_page(vcpu, page); - } - } else if (kvm_mmu_lookup_page(vcpu, gfn)) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - mark_page_dirty(vcpu->kvm, gfn); - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - *guest_ent |= PT_DIRTY_MASK; - *write_pt = 1; - return 0; - } - mark_page_dirty(vcpu->kvm, gfn); - *shadow_ent |= PT_WRITABLE_MASK; - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - *guest_ent |= PT_DIRTY_MASK; - rmap_add(vcpu, shadow_ent); - - return 1; -} - /* * Page fault handler. 
There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -475,7 +431,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *shadow_pte; - int fixed; int write_pt = 0; int r; @@ -503,19 +458,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, write_fault); - pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, - shadow_pte, *shadow_pte); - - /* - * Update the shadow pte. - */ - if (write_fault) - fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, - user_fault, &write_pt); - - pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, - shadow_pte, *shadow_pte); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); FNAME(release_walker)(&walker); -- cgit v1.2.3 From e663ee64aefc57f7eff7325142206c4ea0200be8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 15:46:04 +0300 Subject: KVM: MMU: Make setting shadow ptes atomic on i386 Signed-off-by: Avi Kivity --- drivers/kvm/Kconfig | 1 + drivers/kvm/mmu.c | 14 ++++++++++++-- drivers/kvm/paging_tmpl.h | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index 2f661e5f0dae..33fa28a8c199 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -11,6 +11,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on X86 && EXPERIMENTAL + depends on X86_CMPXCHG64 || 64BIT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. 
You will need a fairly recent diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 3cdbf687df25..f24b540148aa 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "vmx.h" #include "kvm.h" @@ -204,6 +205,15 @@ static int is_rmap_pte(u64 pte) == (PT_WRITABLE_MASK | PT_PRESENT_MASK); } +static void set_shadow_pte(u64 *sptep, u64 spte) +{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min, gfp_t gfp_flags) @@ -446,7 +456,7 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); kvm_arch_ops->tlb_flush(vcpu); - *spte &= ~(u64)PT_WRITABLE_MASK; + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); } } @@ -699,7 +709,7 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, } BUG_ON(!parent_pte); kvm_mmu_put_page(vcpu, page, parent_pte); - *parent_pte = 0; + set_shadow_pte(parent_pte, 0); } kvm_mmu_page_unlink_children(vcpu, page); if (!page->root_count) { diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 397a4039eaad..fabc2c9093cd 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -234,7 +234,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, spte |= gaddr; spte |= PT_SHADOW_IO_MARK; spte &= ~PT_PRESENT_MASK; - *shadow_pte = spte; + set_shadow_pte(shadow_pte, spte); return; } @@ -280,7 +280,7 @@ unshadowed: if (access_bits & PT_WRITABLE_MASK) mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); - *shadow_pte = spte; + set_shadow_pte(shadow_pte, spte); page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); if (!was_rmapped) rmap_add(vcpu, shadow_pte); -- cgit v1.2.3 From fd97dc516c372982f9c3637e20b131e1f55ac2f6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 
2007 18:20:14 +0300 Subject: KVM: MMU: Simplify accessed/dirty/present/nx bit handling Always set the accessed and dirty bits (since having them cleared causes a read-modify-write cycle), always set the present bit, and copy the nx bit from the guest. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 5 ----- drivers/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index f24b540148aa..b47391ffe549 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -91,11 +91,6 @@ static int dbg = 1; #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) -#define PT32_PTE_COPY_MASK \ - (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK) - -#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK) - #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 59b4cb29e0f7..b17a4b783cd4 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #else @@ -46,7 +45,6 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK #define PT_MAX_FULL_LEVELS 2 #else #error Invalid PTTYPE value @@ -219,7 +217,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); } - spte |= *gpte & PT_PTE_COPY_MASK; + spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; + spte |= *gpte & PT64_NX_MASK; spte |= access_bits << PT_SHADOW_BITS_OFFSET; if
if (!dirty) access_bits &= ~PT_WRITABLE_MASK; @@ -495,7 +494,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) #undef PT_INDEX #undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK -#undef PT_PTE_COPY_MASK -#undef PT_NON_PTE_COPY_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_MAX_FULL_LEVELS -- cgit v1.2.3 From b64b3763a5b3868e85330c891e1a30189dcde9b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 18:24:09 +0300 Subject: KVM: MMU: Don't cache guest access bits in the shadow page table This was once used to avoid accessing the guest pte when upgrading the shadow pte from read-only to read-write. But usually we need to set the guest pte dirty or accessed bits anyway, so this wasn't really exploited. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 8 -------- drivers/kvm/paging_tmpl.h | 1 - 2 files changed, 9 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index b47391ffe549..986d01294f3b 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -97,14 +97,6 @@ static int dbg = 1; #define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1) -#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT) - -#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1) -#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT)) - -#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT) - #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index b17a4b783cd4..adc1206cf659 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -219,7 +219,6 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; spte |= *gpte & PT64_NX_MASK; - spte |= access_bits << PT_SHADOW_BITS_OFFSET; if 
(!dirty) access_bits &= ~PT_WRITABLE_MASK; -- cgit v1.2.3 From bd2b2baa5c5fbb08b4b0df7508ff419407f7ece6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 18:28:51 +0300 Subject: KVM: MMU: Remove unused large page marker This has not been used for some time, as the same information is available in the page header. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 1 - drivers/kvm/paging_tmpl.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 986d01294f3b..283df031b03d 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -94,7 +94,6 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define VALID_PAGE(x) ((x) != INVALID_PAGE) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index adc1206cf659..a7c5cb0319ea 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -384,8 +384,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (walker->level == PT_DIRECTORY_LEVEL) { - if (prev_shadow_ent) - *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, guest_ent, shadow_ent, walker->inherited_ar, user_fault, write_fault, ptwrite, walker, walker->gfn); -- cgit v1.2.3 From 17c3ba9d37dbda490792a2b52953f09d0dee30d6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 4 Jun 2007 15:58:30 +0300 Subject: KVM: Lazy guest cr3 switching Switching the guest paging context may require us to allocate memory, which might fail. Instead of wiring up error paths everywhere, make context switching lazy and actually do the switch before the next guest entry, where we can return an error if allocation fails.
Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 10 ++++++++++ drivers/kvm/mmu.c | 43 ++++++++++++++++++++++--------------------- drivers/kvm/svm.c | 4 ++++ drivers/kvm/vmx.c | 4 ++++ 4 files changed, 40 insertions(+), 21 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 199e1e9bae25..3ec4e26b9bd7 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -544,6 +544,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); @@ -555,6 +557,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, return vcpu->mmu.page_fault(vcpu, gva, error_code); } +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 283df031b03d..5915d7a1c4f7 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -949,9 +949,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa); + context->root_hpa = INVALID_PAGE; return 0; } @@ -965,11 +963,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); mmu_free_roots(vcpu); - if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - kvm_arch_ops->set_cr3(vcpu, 
vcpu->mmu.root_hpa); } static void inject_page_fault(struct kvm_vcpu *vcpu, @@ -1003,10 +996,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->free = paging_free; context->root_level = level; context->shadow_root_level = level; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1025,10 +1015,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->free = paging_free; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1042,7 +1029,6 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - mmu_topup_memory_caches(vcpu); if (!is_paging(vcpu)) return nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) @@ -1063,17 +1049,32 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - destroy_kvm_mmu(vcpu); - r = init_kvm_mmu(vcpu); - if (r < 0) - goto out; + spin_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); out: + spin_unlock(&vcpu->kvm->lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 6cd6a50a0340..ec040e2f8c58 100644 --- a/drivers/kvm/svm.c +++ 
b/drivers/kvm/svm.c @@ -1483,6 +1483,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return r; + if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 93e5bb2c40e3..4d255493a57e 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1988,6 +1988,10 @@ again: vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + /* * Loading guest fpu may have cleared host cr0.ts */ -- cgit v1.2.3 From 7b53aa56508479507c6e5667bb252ca7c2cd19cf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Jun 2007 12:17:03 +0300 Subject: KVM: Fix vcpu freeing for guest smp A vcpu can pin up to four mmu shadow pages, which means the freeing loop will never terminate. Fix by first unpinning shadow pages on all vcpus, then freeing shadow pages. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 15 +++++++++++++++ drivers/kvm/mmu.c | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 3c3231d8dabf..3ff8ee56279c 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -381,6 +381,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) } } +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->vmcs) + return; + + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { if (!vcpu->vmcs) @@ -401,6 +411,11 @@ static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + /* + * Unpin any mmu pages first. 
+ */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + kvm_unload_vcpu_mmu(&kvm->vcpus[i]); for (i = 0; i < KVM_MAX_VCPUS; ++i) kvm_free_vcpu(&kvm->vcpus[i]); } diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 5915d7a1c4f7..d4de988d1828 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -838,11 +838,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *page; + if (!VALID_PAGE(vcpu->mmu.root_hpa)) + return; #ifdef CONFIG_X86_64 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->mmu.root_hpa; - ASSERT(VALID_PAGE(root)); page = page_header(root); --page->root_count; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -853,7 +854,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->mmu.pae_root[i]; if (root) { - ASSERT(VALID_PAGE(root)); root &= PT64_BASE_ADDR_MASK; page = page_header(root); --page->root_count; -- cgit v1.2.3 From d9e368d61263055eceac2966bb7ea31b89da3425 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2007 19:18:30 +0300 Subject: KVM: Flush remote tlbs when reducing shadow pte permissions When a vcpu causes a shadow tlb entry to have reduced permissions, it must also clear the tlb on remote vcpus. 
We do that by: - setting a bit on the vcpu that requests a tlb flush before the next entry - if the vcpu is currently executing, we send an ipi to make sure it exits before we continue Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 8 ++++++++ drivers/kvm/kvm_main.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ drivers/kvm/mmu.c | 8 +++++--- drivers/kvm/svm.c | 17 ++++++++++++----- drivers/kvm/vmx.c | 22 +++++++++++++++------- 5 files changed, 84 insertions(+), 15 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 528a56b1790e..b08272bce213 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -83,6 +83,11 @@ #define KVM_PIO_PAGE_OFFSET 1 +/* + * vcpu->requests bit members + */ +#define KVM_TLB_FLUSH 0 + /* * Address types: * @@ -272,6 +277,8 @@ struct kvm_vcpu { u64 host_tsc; struct kvm_run *run; int interrupt_window_open; + int guest_mode; + unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) unsigned long irq_pending[NR_IRQ_WORDS]; @@ -530,6 +537,7 @@ void save_msrs(struct vmx_msr_entry *e, int n); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 4e1a017f3db7..633c2eded08d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include "x86_emulate.h" #include "segment_descriptor.h" @@ -309,6 +311,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); } +static void ack_flush(void *_completed) +{ + atomic_t *completed = _completed; + + atomic_inc(completed); +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu, needed; + cpumask_t cpus; + struct 
kvm_vcpu *vcpu; + atomic_t completed; + + atomic_set(&completed, 0); + cpus_clear(cpus); + needed = 0; + for (i = 0; i < kvm->nvcpus; ++i) { + vcpu = &kvm->vcpus[i]; + if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != raw_smp_processor_id()) + if (!cpu_isset(cpu, cpus)) { + cpu_set(cpu, cpus); + ++needed; + } + } + + /* + * We really want smp_call_function_mask() here. But that's not + * available, so ipi all cpus in parallel and wait for them + * to complete. + */ + for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus)) + smp_call_function_single(cpu, ack_flush, &completed, 1, 0); + while (atomic_read(&completed) != needed) { + cpu_relax(); + barrier(); + } +} + static struct kvm *kvm_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index d4de988d1828..ad50cfda5ac1 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -441,7 +441,7 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); } } @@ -656,7 +656,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, rmap_remove(vcpu, &pt[i]); pt[i] = 0; } - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); return; } @@ -669,6 +669,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); } + kvm_flush_remote_tlbs(vcpu->kvm); } static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, @@ -1093,6 +1094,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } } *spte = 0; + kvm_flush_remote_tlbs(vcpu->kvm); } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, @@ -1308,7 +1310,7 @@ 
void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) } mmu_free_memory_caches(vcpu); - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); init_kvm_mmu(vcpu); } diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 70f386e04cbe..eb175c5cd499 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1470,6 +1470,11 @@ static void load_db_regs(unsigned long *db_regs) asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); } +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u16 fs_selector; @@ -1487,6 +1492,11 @@ again: clgi(); + vcpu->guest_mode = 1; + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + svm_flush_tlb(vcpu); + pre_svm_run(vcpu); save_host_msrs(vcpu); @@ -1618,6 +1628,8 @@ again: #endif : "cc", "memory" ); + vcpu->guest_mode = 0; + if (vcpu->fpu_active) { fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); @@ -1682,11 +1694,6 @@ again: return r; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu) -{ - force_new_asid(vcpu); -} - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { vcpu->svm->vmcb->save.cr3 = root; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a1f51b9d482d..b969db1e0830 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1972,6 +1972,11 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); +} + static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; @@ -1997,9 +2002,15 @@ again: */ vmcs_writel(HOST_CR0, read_cr0()); + local_irq_disable(); + + vcpu->guest_mode = 1; + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + vmx_flush_tlb(vcpu); + asm ( /* Store host registers */ - "pushf \n\t" #ifdef CONFIG_X86_64 "push %%rax; push %%rbx; push 
%%rdx;" "push %%rsi; push %%rdi; push %%rbp;" @@ -2091,7 +2102,6 @@ again: "pop %%ecx; popa \n\t" #endif "setbe %0 \n\t" - "popf \n\t" : "=q" (fail) : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), "c"(vcpu), @@ -2115,6 +2125,9 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + vcpu->guest_mode = 0; + local_irq_enable(); + ++vcpu->stat.exits; vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; @@ -2167,11 +2180,6 @@ out: return r; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ - vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); -} - static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 err_code) -- cgit v1.2.3 From 88a97f0b2fe1cd08d06390dc2669b709ea96e11a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Jun 2007 17:13:26 +0800 Subject: KVM: MMU: Fix Wrong tlb flush order Need to flush the tlb after updating a pte, not before. Signed-off-by: Shaohua Li Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index ad50cfda5ac1..49ffbd3da749 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -441,8 +441,8 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); - kvm_flush_remote_tlbs(vcpu->kvm); set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + kvm_flush_remote_tlbs(vcpu->kvm); } } -- cgit v1.2.3 From e495606dd09d79f9fa496334ac3958f6ff179d82 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Jun 2007 14:15:57 -0400 Subject: KVM: Clean up #includes Remove unnecessary ones, and rearrange the remaining ones in the standard order.
Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 18 +++++++----------- drivers/kvm/mmu.c | 10 ++++++---- drivers/kvm/svm.c | 7 ++++--- drivers/kvm/vmx.c | 5 +++-- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'drivers/kvm/mmu.c') diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 26ca90f74fc7..ea027190a658 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -16,37 +16,33 @@ */ #include "kvm.h" +#include "x86_emulate.h" +#include "segment_descriptor.h" #include #include #include -#include -#include #include #include -#include #include #include #include -#include #include -#include #include #include #include -#include #include #include -#include -#include -#include #include #include #include #include -#include "x86_emulate.h" -#include "segment_descriptor.h" +#include +#include +#include +#include +#include MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 49ffbd3da749..b297a6b111ac 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -16,16 +16,18 @@ * the COPYING file in the top-level directory. 
* */ + +#include "vmx.h" +#include "kvm.h" + #include #include -#include #include #include #include -#include -#include "vmx.h" -#include "kvm.h" +#include +#include #undef MMU_DEBUG diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index a0d442883e17..bc818cc126e3 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -14,16 +14,17 @@ * */ +#include "kvm_svm.h" +#include "x86_emulate.h" + #include #include #include #include #include #include -#include -#include "kvm_svm.h" -#include "x86_emulate.h" +#include MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 7d04ffaaf94a..80628f69916d 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -17,17 +17,18 @@ #include "kvm.h" #include "vmx.h" +#include "segment_descriptor.h" + #include #include #include #include #include #include + #include #include -#include "segment_descriptor.h" - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -- cgit v1.2.3