diff options
34 files changed, 861 insertions, 493 deletions
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 35eca377543d..8138201efb09 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows: can be taken inside a kvm->srcu read-side critical section, while kvm->slots_lock cannot. +- kvm->mn_active_invalidate_count ensures that pairs of + invalidate_range_start() and invalidate_range_end() callbacks + use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock + are taken on the waiting side in install_new_memslots, so MMU notifiers + must not take either kvm->slots_lock or kvm->slots_arch_lock. + On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..6365087f3160 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index dc6591548f0c..f38dfe195ef2 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4527ac7b5961..02574d7b3612 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1953,7 +1953,7 @@ out: static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (gfn >= memslots[slot].base_gfn && @@ -1974,7 +1974,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) if (gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); } return start; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 974cbfb1eefe..6a73ff7db5f9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -482,6 +482,7 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; + bool is_paused; }; struct kvm_pmu { @@ -1002,9 +1003,8 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { bool long_mode; - bool shinfo_set; u8 upcall_vector; - struct gfn_to_hva_cache shinfo_cache; + gfn_t shinfo_gfn; }; enum kvm_irqchip_mode { @@ -1209,6 +1209,7 @@ struct kvm_vm_stat { u64 lpages; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; + u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { @@ -1536,12 +1537,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); @@ -1772,11 +1773,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline u32 get_rdx_init_val(void) -{ - return 0x600; /* P6 family */ -} - static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a6e218c6140d..5a69cce4d72d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (!kvm_vcpu_is_bsp(vcpu) || !pit) + /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ + if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 11e4065e1617..bbd4a5d18b5d 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -35,11 +35,7 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba5a27879f1d..76fb00921203 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; + WARN_ONCE(!irqchip_in_kernel(kvm), + "Dirty APIC map without an in-kernel local APIC"); + mutex_lock(&kvm->arch.apic_map_lock); /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map @@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) @@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) struct kvm_lapic *apic = vcpu->arch.apic; int i; + if (!init_event) { + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + if (!apic) return; @@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE); + apic->base_address = APIC_DEFAULT_PHYS_BASE; + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); @@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - kvm_lapic_set_base(vcpu, - vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { @@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } - /* - * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC state has changed. - */ - vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c4f4fa23320e..bdf8197d6f84 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -137,12 +137,22 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -/* make pte_list_desc fit well in cache line */ -#define PTE_LIST_EXT 3 +/* make pte_list_desc fit well in cache lines */ +#define PTE_LIST_EXT 14 +/* + * Slight optimization of cacheline layout, by putting `more' and `spte_count' + * at the start; then accessing it will only use one single cacheline for + * either full (entries==PTE_LIST_EXT) case or entries<=6. + */ struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; + /* + * Stores number of entries stored in the pte_list_desc. No need to be + * u64 but just for easier alignment. When PTE_LIST_EXT, means full. + */ + u64 spte_count; + u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { @@ -592,9 +602,9 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. - * Returns non-zero if the PTE was previously valid. + * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(u64 *sptep) +static u64 mmu_spte_clear_track_bits(u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; @@ -605,7 +615,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_shadow_present_pte(old_spte)) - return 0; + return old_spte; pfn = spte_to_pfn(old_spte); @@ -622,7 +632,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); - return 1; + return old_spte; } /* @@ -686,28 +696,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -786,7 +804,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, +static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; @@ -799,12 +817,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, } } -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } @@ -893,7 +911,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - int i, count = 0; + int count = 0; if (!rmap_head->val) { rmap_printk("%p %llx 0->1\n", spte, *spte); @@ -903,24 +921,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; + desc->spte_count = 2; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1]) { + while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; - if (!desc->more) { desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; + desc->spte_count = 0; break; } desc = desc->more; } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; + count += desc->spte_count; + desc->sptes[desc->spte_count++] = spte; } return count; } @@ -930,13 +948,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) { - int j; + int j = desc->spte_count - 1; - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; - if (j != 0) + desc->spte_count--; + if (desc->spte_count) return; if (!prev_desc && !desc->more) rmap_head->val = 0; @@ -969,7 +986,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(rmap_head, desc, i, prev_desc); @@ -990,24 +1007,41 @@ static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) __pte_list_remove(sptep, rmap_head); } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +/* Return true if rmap existed, false otherwise */ +static bool pte_list_destroy(struct kvm_rmap_head *rmap_head) { - unsigned long idx; + struct pte_list_desc *desc, *next; + int i; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + mmu_spte_clear_track_bits((u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + for (; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; } -static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, - struct kvm_mmu_page *sp) +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, + const struct kvm_memory_slot *slot) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; + unsigned long idx; - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1020,24 +1054,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } + static void rmap_remove(struct kvm *kvm, u64 *spte) { + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmap_head = gfn_to_rmap(kvm, gfn, sp); + + /* + * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the + * context of a vCPU so have to determine which memslots to use based + * on context information in sp->role. + */ + slots = kvm_memslots_for_spte_role(kvm, sp->role); + + slot = __gfn_to_memslot(slots, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + __pte_list_remove(spte, rmap_head); } @@ -1119,7 +1168,9 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (mmu_spte_clear_track_bits(sptep)) + u64 old_spte = mmu_spte_clear_track_bits(sptep); + + if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } @@ -1218,7 +1269,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1256,8 +1307,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1289,8 +1340,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ @@ -1356,7 +1407,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); + rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } } @@ -1377,20 +1428,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1442,7 +1482,7 @@ restart: struct slot_rmap_walk_iterator { /* input fields. */ - struct kvm_memory_slot *slot; + const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; @@ -1462,14 +1502,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, - iterator->slot); + iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - struct kvm_memory_slot *slot, int start_level, + const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; @@ -1584,12 +1623,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, @@ -2696,6 +2736,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (is_shadow_present_pte(*sptep)) { if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > vcpu->kvm->stat.max_mmu_rmap_size) + vcpu->kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } @@ -2939,15 +2981,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; drop_large_spte(vcpu, it.sptep); - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - - link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + if (is_shadow_present_pte(*it.sptep)) + continue; + + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); + if (is_tdp && huge_page_disallowed && + req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, @@ -3096,15 +3139,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + +/* + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + */ +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +{ struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3115,14 +3183,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3180,8 +3249,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3194,8 +3262,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3614,6 +3681,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3621,8 +3690,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3636,8 +3703,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3649,11 +3714,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -3828,11 +3897,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -5134,7 +5201,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - if (WARN_ON_ONCE(r == RET_PF_INVALID)) + if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5276,12 +5343,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot); +typedef bool (*slot_level_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) @@ -5308,7 +5376,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool flush_on_yield) { @@ -5319,7 +5387,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, @@ -5578,7 +5646,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush = slot_handle_level_range(kvm, memslot, + flush = slot_handle_level_range(kvm, + (const struct kvm_memory_slot *) memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); @@ -5606,13 +5675,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level) { bool flush = false; @@ -5648,7 +5717,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5687,10 +5756,8 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *slot) { - /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { @@ -5726,7 +5793,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { bool flush = false; diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index cedc17b2f60e..9e7dcf999f08 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 35567293c1fd..ca7b7595bbfc 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -124,8 +124,8 @@ static inline bool is_nx_huge_page_enabled(void) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); @@ -140,6 +140,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index efbad33a0645..2924a4081a19 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0853370bd811..dab6cb46cdb2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,7 +10,7 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = false; +static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ @@ -527,6 +527,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -538,15 +542,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +/* + * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a + * TDP page fault. + * + * @vcpu: The vcpu instance that took the TDP page fault. + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, + struct tdp_iter *iter, + u64 new_spte) { + struct kvm *kvm = vcpu->kvm; + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) return false; - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, iter->level); + /* + * Use kvm_vcpu_gfn_to_memslot() instead of going through + * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. + */ + if (is_writable_pte(new_spte)) { + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); + + if (slot && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON_ONCE(iter->level > PG_LEVEL_4K); + mark_page_dirty_in_slot(kvm, slot, iter->gfn); + } + } + return true; } @@ -559,7 +588,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -927,7 +956,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) return RET_PF_RETRY; /* @@ -1031,8 +1060,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, - new_spte)) { + if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { tdp_mmu_link_page(vcpu->kvm, sp, true, huge_page_disallowed && req_level >= iter.level); @@ -1242,8 +1270,8 @@ retry: * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level) +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1313,7 +1341,8 @@ retry: * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1516,6 +1545,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1527,14 +1558,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1cae4485b3bc..b224d126adf9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -61,10 +61,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level); +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level); bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, - struct kvm_memory_slot *slot); + const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, @@ -77,8 +77,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 827886c12c16..0772bad9165c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; } static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return; /* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false; clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 67e753edfa22..0e4f2b1fa9fb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running; counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 61738ff8ef33..5e13357da21e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -659,11 +659,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7fbce342eec4..81601c31f549 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -584,6 +584,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xcr0 = svm->vcpu.arch.xcr0; save->pkru = svm->vcpu.arch.pkru; save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; /* * SEV-ES will use a VMSA that is pointed to by the VMCB, not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e8ccab50ebf6..9d72b1df426e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1161,8 +1161,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@ -1241,29 +1239,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@ -1273,14 +1256,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; @@ -1330,23 +1311,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); - kvm_rdx_write(vcpu, eax); - if (kvm_vcpu_apicv_active(vcpu) && !init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } @@ -1560,7 +1530,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } @@ -2077,11 +2047,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. */ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a52134b0c42..0d0dd6580cfd 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4267,7 +4267,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@ -4278,17 +4278,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@ -4296,14 +4296,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@ -4382,9 +4383,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9efc1a6b8693..10cc4f65c4ef 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 927a552393b9..ae8e62df16dd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, } /* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. */ -static void setup_msrs(struct vcpu_vmx *vmx) +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; @@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. @@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (is_unrestricted_guest(vcpu) || - (enable_ept && is_paging(vcpu))) + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. + */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); break; } } @@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } - vmx_set_segment(vcpu, save, seg); + __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); return 0; } @@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + unsigned long hw_cr0, old_cr0_pg; + u32 tmp; + + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); } + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (enable_ept && !is_unrestricted_guest(vcpu)) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ + if (!(cr0 & X86_CR0_PG)) { + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); @@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; + return; } vmcs_writel(sf->base, var->base); @@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} -out: - vmx->emulation_required = emulation_required(vcpu); +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (secondary_exec_controls_get(to_vmx(vcpu)) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; @@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + if (!cpu_has_vmx_msr_bitmap()) return; + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + vmx_reset_x2apic_msrs(vcpu, mode); /* @@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, mode); - - vmx->msr_bitmap_mode = mode; -} - void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } pt_update_intercept_for_msr(vcpu); - vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4118,8 +4127,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } u32 vmx_exec_control(struct vcpu_vmx *vmx) @@ -4388,32 +4396,35 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -4436,16 +4447,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4458,31 +4459,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - vmx_update_exception_bitmap(vcpu); - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4996,6 +4977,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -5021,14 +5003,13 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -5338,7 +5319,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - WARN_ON_ONCE(!enable_vnmi); + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5896,7 +5879,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * below) should never happen as that means we incorrectly allowed a * nested VM-Enter with an invalid vmcs12. */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -6189,7 +6173,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) } secondary_exec_controls_set(vmx, sec_exec_control); - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) @@ -6274,7 +6258,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) int max_irr; bool max_irr_updated; - WARN_ON(!vcpu->arch.apicv_active); + if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + return -EIO; + if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* @@ -6357,7 +6343,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; - if (WARN_ONCE(!is_external_intr(intr_info), + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6838,7 +6824,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } - vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); @@ -7154,6 +7139,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; + vmx_setup_uret_msrs(vmx); + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 17a1cb4b059d..4f151175d45a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -227,7 +227,7 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; - u8 msr_bitmap_mode; + u8 x2apic_msr_bitmap_mode; /* * If true, host state has been stored in vmx->loaded_vmcs for @@ -371,12 +371,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5d5c5ed7dd4..3cedc7cc132a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -235,6 +235,7 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, mmu_unsync), STATS_DESC_ICOUNTER(VM, lpages), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == @@ -6567,9 +6568,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -9263,7 +9264,6 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - struct kvm_vcpu *except; unsigned long old, new, expected; if (!kvm_x86_ops.check_apicv_inhibit_reasons || @@ -9289,16 +9289,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) if (kvm_x86_ops.pre_update_apicv_exec_ctrl) static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. - */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9395,6 +9386,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + r = -EIO; + goto out; + } if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; @@ -9976,7 +9971,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } @@ -10581,9 +10577,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; @@ -10799,6 +10792,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10865,10 +10860,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. + * But, go through the motions in case that's ever remedied. + */ + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = 0x600; + kvm_rdx_write(vcpu, eax); + vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. + */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + /* * Reset the MMU context if paging was enabled prior to INIT (which is * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the @@ -10879,7 +10905,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) kvm_mmu_reset_context(vcpu); + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -11123,6 +11162,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); } @@ -11481,7 +11521,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -11561,10 +11601,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ae17250e1efe..9ea9c3dabe37 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; - int ret; + int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, - gpa, PAGE_SIZE); - if (ret) + if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { + ret = -EFAULT; goto out; - - kvm->arch.xen.shinfo_set = true; + } + kvm->arch.xen.shinfo_gfn = gfn; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_set = false; + kvm->arch.xen.shinfo_gfn = GPA_INVALID; r = 0; break; } @@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_set) - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); - else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); r = 0; break; @@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } +void kvm_xen_init_vm(struct kvm *kvm) +{ + kvm->arch.xen.shinfo_gfn = GPA_INVALID; +} + void kvm_xen_destroy_vm(struct kvm *kvm) { if (kvm->arch.xen_hvm_config.msr) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 463a7844a8ca..cc0cf5f37450 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) @@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 1; } +static inline void kvm_xen_init_vm(struct kvm *kvm) +{ +} + static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..492d183dd7d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 +#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -158,6 +159,15 @@ static inline bool is_error_page(struct page *page) }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) +bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, + unsigned long *vcpu_bitmap, cpumask_var_t tmp); +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); +bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, + unsigned long *vcpu_bitmap); + #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -344,6 +354,13 @@ struct kvm_vcpu { struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; + + /* + * The index of the most recently used memslot by this vCPU. It's ok + * if this becomes stale due to memslot changes since we always check + * it is a valid slot. + */ + int last_used_slot; }; /* must be called with irqs disabled */ @@ -512,7 +529,7 @@ struct kvm_memslots { u64 generation; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; - atomic_t lru_slot; + atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; }; @@ -538,6 +555,11 @@ struct kvm { struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + /* Used to wait for completion of MMU notifiers. */ + spinlock_t mn_invalidate_lock; + unsigned long mn_active_invalidate_count; + struct rcuwait mn_memslots_update_rcuwait; + /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only @@ -596,6 +618,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool vm_bugged; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -629,6 +652,30 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline void kvm_vm_bugged(struct kvm *kvm) +{ + kvm->vm_bugged = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); +} + +#define KVM_BUG(cond, kvm, fmt...) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + +#define KVM_BUG_ON(cond, kvm) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); @@ -720,6 +767,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); +bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); @@ -943,15 +991,6 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except, - unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except); -bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap); - long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, @@ -1157,29 +1196,49 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * search_memslots() and __gfn_to_memslot() are here because they are - * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. - * gfn_to_memslot() itself isn't here as an inline because that would - * bloat other code too much. + * Returns a pointer to the memslot at slot_index if it contains gfn. + * Otherwise returns NULL. + */ +static inline struct kvm_memory_slot * +try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + if (slot_index < 0 || slot_index >= slots->used_slots) + return NULL; + + /* + * slot_index can come from vcpu->last_used_slot which is not kept + * in sync with userspace-controllable memslot deletion. So use nospec + * to prevent the CPU from speculating past the end of memslots[]. + */ + slot_index = array_index_nospec(slot_index, slots->used_slots); + slot = &slots->memslots[slot_index]; + + if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) + return slot; + else + return NULL; +} + +/* + * Returns a pointer to the memslot that contains gfn and records the index of + * the slot in index. Otherwise returns NULL. * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! */ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); struct kvm_memory_slot *memslots = slots->memslots; + struct kvm_memory_slot *slot; if (unlikely(!slots->used_slots)) return NULL; - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return &memslots[slot]; - while (start < end) { - slot = start + (end - start) / 2; + int slot = start + (end - start) / 2; if (gfn >= memslots[slot].base_gfn) end = slot; @@ -1187,19 +1246,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) start = slot + 1; } - if (start < slots->used_slots && gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); - return &memslots[start]; + slot = try_get_memslot(slots, start, gfn); + if (slot) { + *index = start; + return slot; } return NULL; } +/* + * __gfn_to_memslot() and its descendants are here because it is called from + * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() + * itself isn't here as an inline because that would bloat other code too much. + */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { - return search_memslots(slots, gfn); + struct kvm_memory_slot *slot; + int slot_index = atomic_read(&slots->last_used_slot); + + slot = try_get_memslot(slots, slot_index, gfn); + if (slot) + return slot; + + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + atomic_set(&slots->last_used_slot, slot_index); + return slot; + } + + return NULL; } static inline unsigned long diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index e2baa187a21e..71e277c7c3f3 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -222,8 +222,6 @@ static void *vcpu_thread_main(void *arg) int vcpu_id = vcpu_args->vcpu_id; int current_iteration = -1; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); - while (spin_wait_for_next_iteration(¤t_iteration)) { switch (READ_ONCE(iteration_work)) { case ITERATION_ACCESS_MEMORY: @@ -333,7 +331,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pthread_t *vcpu_threads; int vcpus = params->vcpus; - vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, + vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1, params->backing_src); perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index b74704305835..e79c1b64977f 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -52,7 +52,6 @@ static void *vcpu_worker(void *data) struct timespec start; struct timespec ts_diff; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); clock_gettime(CLOCK_MONOTONIC, &start); @@ -293,7 +292,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; int r; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type); perf_test_args.wr_fract = 1; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 80cbd3a748c0..3c30d0045d8d 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -44,7 +44,6 @@ static void *vcpu_worker(void *data) struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); while (!READ_ONCE(host_quit)) { @@ -94,8 +93,59 @@ struct test_params { int wr_fract; bool partition_vcpu_memory_access; enum vm_mem_backing_src_type backing_src; + int slots; }; +static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; + + vm_mem_region_set_flags(vm, slot, flags); + } +} + +static inline void enable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, true); +} + +static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, false); +} + +static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, + uint64_t nr_pages) +{ + uint64_t slot_pages = nr_pages / slots; + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + unsigned long *slot_bitmap = bitmap + i * slot_pages; + + kvm_vm_get_dirty_log(vm, slot, slot_bitmap); + } +} + +static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, + uint64_t nr_pages) +{ + uint64_t slot_pages = nr_pages / slots; + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + unsigned long *slot_bitmap = bitmap + i * slot_pages; + + kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages); + } +} + static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; @@ -114,7 +164,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec clear_dirty_log_total = (struct timespec){0}; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - p->backing_src); + p->slots, p->backing_src); perf_test_args.wr_fract = p->wr_fract; @@ -163,8 +213,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Enable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, - KVM_MEM_LOG_DIRTY_PAGES); + enable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -190,8 +239,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap); - + get_dirty_log(vm, p->slots, bmap, host_num_pages); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -200,9 +248,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); - + clear_dirty_log(vm, p->slots, bmap, host_num_pages); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -213,7 +259,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0); + disable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -244,7 +290,8 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] " - "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name); + "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" + "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -263,6 +310,8 @@ static void help(char *name) " them into a separate region of memory for each vCPU.\n"); printf(" -s: specify the type of memory that should be used to\n" " back the guest data region.\n\n"); + printf(" -x: Split the memory region into this number of memslots.\n" + " (default: 1)"); backing_src_help(); puts(""); exit(0); @@ -276,6 +325,7 @@ int main(int argc, char *argv[]) .wr_fract = 1, .partition_vcpu_memory_access = true, .backing_src = VM_MEM_SRC_ANONYMOUS, + .slots = 1, }; int opt; @@ -286,7 +336,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) { + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) { switch (opt) { case 'i': p.iterations = atoi(optarg); @@ -316,6 +366,9 @@ int main(int argc, char *argv[]) case 's': p.backing_src = parse_backing_src_type(optarg); break; + case 'x': + p.slots = atoi(optarg); + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 005f2143adeb..df9f1a3a3ffb 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -44,7 +44,7 @@ extern struct perf_test_args perf_test_args; extern uint64_t guest_test_phys_mem; struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src); void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index b488f4aefea8..0ef80dbdc116 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -50,11 +50,12 @@ static void guest_code(uint32_t vcpu_id) } struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src) { struct kvm_vm *vm; uint64_t guest_num_pages; + int i; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); @@ -68,6 +69,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, "Guest memory size is not host page size aligned."); TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, @@ -95,10 +99,16 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - /* Add an extra memory slot for testing */ - vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem, - PERF_TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = guest_test_phys_mem + + region_pages * perf_test_args.guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + PERF_TEST_MEM_SLOT_INDEX + i, + region_pages, 0); + } /* Do mapping for the demand paging memory slot */ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); @@ -140,6 +150,8 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, vcpu_gpa = guest_test_phys_mem; } + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + (vcpu_args->pages * perf_test_args.guest_page_size)); diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 98351ba0933c..4cfcafea9f5a 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -45,7 +45,6 @@ static void *vcpu_worker(void *data) struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); /* Let the guest access its memory until a stop signal is received */ @@ -105,7 +104,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; int vcpu_id; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS); perf_test_args.wr_fract = 1; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b50dbe269f4b..c115e2648d9d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -415,6 +415,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + vcpu->last_used_slot = 0; } void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -496,17 +497,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - /* The on_lock() path does not yet support lock elision. */ - if (!IS_KVM_NULL_FN(range->on_lock)) { - locked = true; - KVM_MMU_LOCK(kvm); - - range->on_lock(kvm, range->start, range->end); - - if (IS_KVM_NULL_FN(range->handler)) - goto out_unlock; - } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, slots) { @@ -538,6 +528,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (!locked) { locked = true; KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm, range->start, range->end); + if (IS_KVM_NULL_FN(range->handler)) + break; } ret |= range->handler(kvm, &gfn_range); } @@ -546,7 +540,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) kvm_flush_remote_tlbs(kvm); -out_unlock: if (locked) KVM_MMU_UNLOCK(kvm); @@ -604,11 +597,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, trace_kvm_set_spte_hva(address); /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(), - * and so always runs with an elevated notifier count. This obviates - * the need to bump the sequence count. + * .change_pte() must be surrounded by .invalidate_range_{start,end}(). + * If mmu_notifier_count is zero, then no in-progress invalidations, + * including this one, found a relevant memslot at start(); rechecking + * memslots here is unnecessary. Note, a false positive (count elevated + * by a different invalidation) is sub-optimal but functionally ok. */ - WARN_ON_ONCE(!kvm->mmu_notifier_count); + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + if (!READ_ONCE(kvm->mmu_notifier_count)) + return; kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } @@ -658,6 +655,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, trace_kvm_unmap_hva_range(range->start, range->end); + /* + * Prevent memslot modification between range_start() and range_end() + * so that conditionally locking provides the same result in both + * functions. Without that guarantee, the mmu_notifier_count + * adjustments will be imbalanced. + * + * Pairs with the decrement in range_end(). + */ + spin_lock(&kvm->mn_invalidate_lock); + kvm->mn_active_invalidate_count++; + spin_unlock(&kvm->mn_invalidate_lock); + __kvm_handle_hva_range(kvm, &hva_range); return 0; @@ -694,9 +703,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; + bool wake; __kvm_handle_hva_range(kvm, &hva_range); + /* Pairs with the increment in range_start(). */ + spin_lock(&kvm->mn_invalidate_lock); + wake = (--kvm->mn_active_invalidate_count == 0); + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * There can only be one waiter, since the wait happens under + * slots_lock. + */ + if (wake) + rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + BUG_ON(kvm->mmu_notifier_count < 0); } @@ -991,6 +1013,9 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); + spin_lock_init(&kvm->mn_invalidate_lock); + rcuwait_init(&kvm->mn_memslots_update_rcuwait); + INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1113,6 +1138,16 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); + /* + * At this point, pending calls to invalidate_range_start() + * have completed but no more MMU notifiers will run, so + * mn_active_invalidate_count may remain unbalanced. + * No threads can be waiting in install_new_memslots as the + * last reference on KVM has been dropped, but freeing + * memslots would deadlock without this manual intervention. + */ + WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); + kvm->mn_active_invalidate_count = 0; #else kvm_arch_flush_shadow_all(kvm); #endif @@ -1134,6 +1169,16 @@ void kvm_get_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_get_kvm); +/* + * Make sure the vm is not during destruction, which is a safe version of + * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. + */ +bool kvm_get_kvm_safe(struct kvm *kvm) +{ + return refcount_inc_not_zero(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); + void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) @@ -1194,8 +1239,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots, slots->used_slots--; - if (atomic_read(&slots->lru_slot) >= slots->used_slots) - atomic_set(&slots->lru_slot, 0); + if (atomic_read(&slots->last_used_slot) >= slots->used_slots) + atomic_set(&slots->last_used_slot, 0); for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { mslots[i] = mslots[i + 1]; @@ -1364,7 +1409,22 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + /* + * Do not store the new memslots while there are invalidations in + * progress, otherwise the locking in invalidate_range_start and + * invalidate_range_end will be unbalanced. + */ + spin_lock(&kvm->mn_invalidate_lock); + prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); + while (kvm->mn_active_invalidate_count) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); + spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize @@ -1980,7 +2040,26 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memory_slot *slot; + int slot_index; + + slot = try_get_memslot(slots, vcpu->last_used_slot, gfn); + if (slot) + return slot; + + /* + * Fall back to searching all memslots. We purposely use + * search_memslots() instead of __gfn_to_memslot() to avoid + * thrashing the VM-wide last_used_index in kvm_memslots. + */ + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + vcpu->last_used_slot = slot_index; + return slot; + } + + return NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); @@ -3612,7 +3691,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) @@ -3822,7 +3901,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, void __user *argp = compat_ptr(arg); int r; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -3888,7 +3967,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; - if (dev->kvm->mm != current->mm) + if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -4210,7 +4289,7 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: @@ -4421,7 +4500,7 @@ static long kvm_vm_compat_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT @@ -4983,12 +5062,12 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, struct kvm_stat_data *stat_data = (struct kvm_stat_data *) inode->i_private; - /* The debugfs files are a reference to the kvm struct which - * is still valid when kvm_destroy_vm is called. - * To avoid the race between open and the removal of the debugfs - * directory we test against the users count. + /* + * The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe + * avoids the race between open and the removal of the debugfs directory. */ - if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; if (simple_attr_open(inode, file, get, |