diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2025-03-19 16:04:33 +0300 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2025-03-19 16:04:33 +0300 |
commit | 4286a3ec2595697f398ce1dc895a2f5b006dae99 (patch) | |
tree | e87fd74c521a2569e22de4b7798216aee0bf2b51 | |
parent | e335300095c370149aada9783df2d7bf5b0db7c7 (diff) | |
parent | 0dab791f05ce2c9f0215f50cb46ed0c3126fe211 (diff) | |
download | linux-4286a3ec2595697f398ce1dc895a2f5b006dae99.tar.xz |
Merge tag 'kvm-x86-mmu-6.15' of https://github.com/kvm-x86/linux into HEAD
KVM x86/mmu changes for 6.15
Add support for "fast" aging of SPTEs in both the TDP MMU and Shadow MMU, where
"fast" means "without holding mmu_lock". Not taking mmu_lock allows multiple
aging actions to run in parallel, and more importantly avoids stalling vCPUs,
e.g. due to holding mmu_lock for an extended duration while a vCPU is faulting
in memory.
For the TDP MMU, protect aging via RCU; the page tables are RCU-protected and
KVM doesn't need to access any metadata to age SPTEs.
For the Shadow MMU, use bit 1 of rmap pointers (bit 0 is used to terminate a
list of rmaps) to implement a per-rmap single-bit spinlock. When aging a gfn,
acquire the rmap's spinlock with read-only permissions, which allows hardening
and optimizing the locking and aging, e.g. locking an rmap for write requires
mmu_lock to also be held. The lock is NOT a true R/W spinlock, i.e. multiple
concurrent readers aren't supported.
To avoid forcing all SPTE updates to use atomic operations (clearing the
Accessed bit out of mmu_lock makes it inherently volatile), rework and rename
spte_has_volatile_bits() to spte_needs_atomic_update() and deliberately exclude
the Accessed bit. KVM (and mm/) already tolerates false positives/negatives
for Accessed information, and all testing has shown that reducing the latency
of aging is far more beneficial to overall system performance than providing
"perfect" young/old information.
-rw-r--r-- | Documentation/virt/kvm/locking.rst | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 365 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 31 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 45 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 1 | ||||
-rw-r--r-- | virt/kvm/Kconfig | 4 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 55 |
11 files changed, 375 insertions, 171 deletions
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index c56d5f26c750..ae8bce7fecbe 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -196,7 +196,7 @@ writable between reading spte and updating spte. Like below case: The Dirty bit is lost in this case. In order to avoid this kind of issue, we always treat the spte as "volatile" -if it can be updated out of mmu-lock [see spte_has_volatile_bits()]; it means +if it can be updated out of mmu-lock [see spte_needs_atomic_update()]; it means the spte is always atomically updated in this case. 3) flush tlbs due to spte updated @@ -212,7 +212,7 @@ function to update spte (present -> present). Since the spte is "volatile" if it can be updated out of mmu-lock, we always atomically update the spte and the race caused by fast page fault can be avoided. -See the comments in spte_has_volatile_bits() and mmu_spte_update(). +See the comments in spte_needs_atomic_update() and mmu_spte_update(). Lockless Access Tracking: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 32ae3aa50c7e..667b93071252 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include <linux/kfifo.h> #include <linux/sched/vhost_task.h> #include <linux/call_once.h> +#include <linux/atomic.h> #include <asm/apic.h> #include <asm/pvclock-abi.h> @@ -405,7 +406,7 @@ union kvm_cpu_role { }; struct kvm_rmap_head { - unsigned long val; + atomic_long_t val; }; struct kvm_pio_request { @@ -1479,6 +1480,7 @@ struct kvm_arch { * tdp_mmu_page set. * * For reads, this list is protected by: + * RCU alone or * the MMU lock in read mode + RCU or * the MMU lock in write mode * diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8ea8c097de 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG + select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8160870398b9..63bb77ee1bb1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return false; } - if (!spte_has_volatile_bits(old_spte)) + if (!spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); @@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || - !spte_has_volatile_bits(old_spte)) + !spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); @@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings. */ #define KVM_RMAP_MANY BIT(0) /* + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always + * operates with mmu_lock held for write), but rmaps can be walked without + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain + * being zapped/dropped _while the rmap is locked_. + * + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be + * done while holding mmu_lock for write. This allows a task walking rmaps + * without holding mmu_lock to concurrently walk the same entries as a task + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify + * the rmaps, thus the walks are stable. + * + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, + * only the rmap chains themselves are protected. E.g. holding an rmap's lock + * ensures all "struct pte_list_desc" fields are stable. + */ +#define KVM_RMAP_LOCKED BIT(1) + +static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head) +{ + unsigned long old_val, new_val; + + lockdep_assert_preemption_disabled(); + + /* + * Elide the lock if the rmap is empty, as lockless walkers (read-only + * mode) don't need to (and can't) walk an empty rmap, nor can they add + * entries to the rmap. I.e. the only paths that process empty rmaps + * do so while holding mmu_lock for write, and are mutually exclusive. + */ + old_val = atomic_long_read(&rmap_head->val); + if (!old_val) + return 0; + + do { + /* + * If the rmap is locked, wait for it to be unlocked before + * trying acquire the lock, e.g. to avoid bouncing the cache + * line. + */ + while (old_val & KVM_RMAP_LOCKED) { + cpu_relax(); + old_val = atomic_long_read(&rmap_head->val); + } + + /* + * Recheck for an empty rmap, it may have been purged by the + * task that held the lock. + */ + if (!old_val) + return 0; + + new_val = old_val | KVM_RMAP_LOCKED; + /* + * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap + * from being reordered outside of the critical section created by + * __kvm_rmap_lock(). + * + * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). + * + * For the !old_val case, no ordering is needed, as there is no rmap + * to walk. + */ + } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val)); + + /* + * Return the old value, i.e. _without_ the LOCKED bit set. It's + * impossible for the return value to be 0 (see above), i.e. the read- + * only unlock flow can't get a false positive and fail to unlock. + */ + return old_val; +} + +static unsigned long kvm_rmap_lock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + return __kvm_rmap_lock(rmap_head); +} + +static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, + unsigned long val) +{ + KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); + /* + * Ensure that all accesses to the rmap have completed before unlocking + * the rmap. + * + * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock(). + */ + atomic_long_set_release(&rmap_head->val, val); +} + +static void kvm_rmap_unlock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + unsigned long new_val) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + __kvm_rmap_unlock(rmap_head, new_val); +} + +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) +{ + return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED; +} + +/* + * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The + * actual locking is the same, but the caller is disallowed from modifying the + * rmap, and so the unlock flow is a nop if the rmap is/was empty. + */ +static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head) +{ + unsigned long rmap_val; + + preempt_disable(); + rmap_val = __kvm_rmap_lock(rmap_head); + + if (!rmap_val) + preempt_enable(); + + return rmap_val; +} + +static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head, + unsigned long old_val) +{ + if (!old_val) + return; + + KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head)); + + __kvm_rmap_unlock(rmap_head, old_val); + preempt_enable(); +} + +/* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, - struct kvm_rmap_head *rmap_head) +static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + u64 *spte, struct kvm_rmap_head *rmap_head) { + unsigned long old_val, new_val; struct pte_list_desc *desc; int count = 0; - if (!rmap_head->val) { - rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & KVM_RMAP_MANY)) { + old_val = kvm_rmap_lock(kvm, rmap_head); + + if (!old_val) { + new_val = (unsigned long)spte; + } else if (!(old_val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->sptes[0] = (u64 *)rmap_head->val; + desc->sptes[0] = (u64 *)old_val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; + } else { + new_val = old_val; } desc->sptes[desc->spte_count++] = spte; } + + kvm_rmap_unlock(kvm, rmap_head, new_val); + return count; } -static void pte_list_desc_remove_entry(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) - rmap_head->val = 0; + *rmap_val = 0; else - rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; + *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; + unsigned long rmap_val; int i; - if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) - return; + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) + goto out; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) - return; + if (!(rmap_val & KVM_RMAP_MANY)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm)) + goto out; - rmap_head->val = 0; + rmap_val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(kvm, rmap_head, + pte_list_desc_remove_entry(kvm, &rmap_val, desc, i); - return; + goto out; } } desc = desc->more; @@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } + +out: + kvm_rmap_unlock(kvm, rmap_head, rmap_val); } static void kvm_zap_one_rmap_spte(struct kvm *kvm, @@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; + unsigned long rmap_val; int i; - if (!rmap_head->val) + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (!rmap_val) return false; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + if (!(rmap_val & KVM_RMAP_MANY)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, } out: /* rmap_head is meaningless now, remember to reset it */ - rmap_head->val = 0; + kvm_rmap_unlock(kvm, rmap_head, 0); return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { + unsigned long rmap_val = kvm_rmap_get(rmap_head); struct pte_list_desc *desc; - if (!rmap_head->val) + if (!rmap_val) return 0; - else if (!(rmap_head->val & KVM_RMAP_MANY)) + else if (!(rmap_val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) */ struct rmap_iterator { /* private fields */ + struct rmap_head *head; struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; @@ -1067,23 +1221,19 @@ struct rmap_iterator { static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { - u64 *sptep; + unsigned long rmap_val = kvm_rmap_get(rmap_head); - if (!rmap_head->val) + if (!rmap_val) return NULL; - if (!(rmap_head->val & KVM_RMAP_MANY)) { + if (!(rmap_val & KVM_RMAP_MANY)) { iter->desc = NULL; - sptep = (u64 *)rmap_head->val; - goto out; + return (u64 *)rmap_val; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); iter->pos = 0; - sptep = iter->desc->sptes[iter->pos]; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; + return iter->desc->sptes[iter->pos]; } /* @@ -1093,14 +1243,11 @@ out: */ static u64 *rmap_get_next(struct rmap_iterator *iter) { - u64 *sptep; - if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; - sptep = iter->desc->sptes[iter->pos]; - if (sptep) - goto out; + if (iter->desc->sptes[iter->pos]) + return iter->desc->sptes[iter->pos]; } iter->desc = iter->desc->more; @@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - sptep = iter->desc->sptes[iter->pos]; - goto out; + return iter->desc->sptes[iter->pos]; } } return NULL; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; } -#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ - _spte_; _spte_ = rmap_get_next(_iter_)) +#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \ + _sptep_; _sptep_ = rmap_get_next(_iter_)) + +#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \ + +#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep))) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (spte_ad_need_write_protect(*sptep)) flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); + } return flush; } @@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); - if (iterator->rmap->val) + if (atomic_long_read(&iterator->rmap->val)) return; } @@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm, kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(cache, spte, rmap_head); + rmap_count = pte_list_add(kvm, cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; @@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, } static bool kvm_rmap_age_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, bool test_only) + struct kvm_gfn_range *range, + bool test_only) { - struct slot_rmap_walk_iterator iterator; + struct kvm_rmap_head *rmap_head; struct rmap_iterator iter; + unsigned long rmap_val; bool young = false; u64 *sptep; + gfn_t gfn; + int level; + u64 spte; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) { - for_each_rmap_spte(iterator.rmap, &iter, sptep) { - u64 spte = *sptep; + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + for (gfn = range->start; gfn < range->end; + gfn += KVM_PAGES_PER_HPAGE(level)) { + rmap_head = gfn_to_rmap(gfn, level, range->slot); + rmap_val = kvm_rmap_lock_readonly(rmap_head); - if (!is_accessed_spte(spte)) - continue; + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { + if (!is_accessed_spte(spte)) + continue; - if (test_only) - return true; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * WARN if mmu_spte_update() signals the need - * for a TLB flush, as Access tracking a SPTE - * should never trigger an _immediate_ flush. - */ - spte = mark_spte_for_access_track(spte); - WARN_ON_ONCE(mmu_spte_update(sptep, spte)); + if (test_only) { + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + return true; + } + + if (spte_ad_enabled(spte)) + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + else + /* + * If the following cmpxchg fails, the + * spte is being concurrently modified + * and should most likely stay young. + */ + cmpxchg64(sptep, spte, + mark_spte_for_access_track(spte)); + young = true; } - young = true; + + kvm_rmap_unlock_readonly(rmap_head, rmap_val); } } return young; } +static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm) +{ + return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, false); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_age_gfn_range(kvm, range); + young = kvm_tdp_mmu_age_gfn_range(kvm, range); + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, false); return young; } @@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, true); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_test_age_gfn(kvm, range); + young = kvm_tdp_mmu_test_age_gfn(kvm, range); + + if (young) + return young; + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, true); return young; } @@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, +static void mmu_page_add_parent_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(cache, parent_pte, &sp->parent_ptes); + pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm, mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(cache, sp, sptep); + mmu_page_add_parent_pte(kvm, cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For @@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && - child->role.guest_mode && !child->parent_ptes.val) + child->role.guest_mode && + !atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 22551e2f1d00..0f9f47b4ab0e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } /* - * Returns true if the SPTE has bits that may be set without holding mmu_lock. - * The caller is responsible for checking if the SPTE is shadow-present, and - * for determining whether or not the caller cares about non-leaf SPTEs. + * Returns true if the SPTE needs to be updated atomically due to having bits + * that may be changed without holding mmu_lock, and for which KVM must not + * lose information. E.g. KVM must not drop Dirty bit information. The caller + * is responsible for checking if the SPTE is shadow-present, and for + * determining whether or not the caller cares about non-leaf SPTEs. */ -bool spte_has_volatile_bits(u64 spte) +bool spte_needs_atomic_update(u64 spte) { + /* SPTEs can be made Writable bit by KVM's fast page fault handler. */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; - if (is_access_track_spte(spte)) + /* + * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked + * SPTEs can be restored by KVM's fast page fault handler. + */ + if (!spte_ad_enabled(spte)) return true; - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; + /* + * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed + * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't + * invalidate TLBs when aging SPTEs, and so it's safe to clobber the + * Accessed bit (and rare in practice). + */ + return is_writable_pte(spte) && !(spte & shadow_dirty_mask); } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 59746854c0af..79cdceba9857 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -bool spte_has_volatile_bits(u64 spte); +bool spte_needs_atomic_update(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 047b78333653..364c5da6c499 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) return xchg(rcu_dereference(sptep), new_spte); } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); @@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) } /* - * SPTEs must be modified atomically if they are shadow-present, leaf - * SPTEs, and have volatile bits, i.e. has bits that can be set outside - * of mmu_lock. The Writable bit can be set by KVM's fast page fault - * handler, and Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs, + * and have volatile bits (bits that can be set outside of mmu_lock) that + * must not be clobbered. */ -static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level) { return is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte); + spte_needs_atomic_update(old_spte); } static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); @@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 046b6ba31197..7cc0564f5f97 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, !tdp_mmu_root_match((_root), (_types)))) { \ } else +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). + */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ + } else + #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) @@ -774,9 +787,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, continue; \ else -#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \ - for_each_tdp_pte(_iter, _kvm, _root, _start, _end) - static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, struct tdp_iter *iter) { @@ -1235,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); - tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { + for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { int r; if (fault->nx_huge_page_workaround_enabled) @@ -1332,21 +1342,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; if (spte_ad_enabled(iter->old_spte)) { - iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, - iter->old_spte, - shadow_accessed_mask, - iter->level); + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { new_spte = mark_spte_for_access_track(iter->old_spte); - iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, - iter->old_spte, new_spte, - iter->level); + /* + * It is safe for the following cmpxchg to fail. Leave the + * Accessed bit set, as the spte is most likely young anyway. + */ + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, @@ -1371,9 +1382,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, * valid roots! */ WARN_ON(types & ~KVM_VALID_ROOTS); - __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) { - guard(rcu)(); + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { if (!is_accessed_spte(iter.old_spte)) continue; @@ -1382,7 +1393,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, return true; ret = true; - kvm_tdp_mmu_age_spte(&iter); + kvm_tdp_mmu_age_spte(kvm, &iter); } } @@ -1904,7 +1915,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->root_role.level; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } @@ -1931,7 +1942,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, struct tdp_iter iter; tdp_ptep_t sptep = NULL; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..c28a6aa1f2ed 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -267,6 +267,7 @@ struct kvm_gfn_range { union kvm_mmu_notifier_arg arg; enum kvm_gfn_range_filter attr_filter; bool may_block; + bool lockless; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 54e959e7d68f..746e1f466aa6 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -104,6 +104,10 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG depends on KVM_GENERIC_MMU_NOTIFIER bool +config KVM_MMU_LOCKLESS_AGING + depends on KVM_GENERIC_MMU_NOTIFIER + bool + config KVM_GENERIC_MEMORY_ATTRIBUTES depends on KVM_GENERIC_MMU_NOTIFIER bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d3..201c14ff476f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -517,6 +517,7 @@ struct kvm_mmu_notifier_range { on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; + bool lockless; }; /* @@ -551,8 +552,8 @@ static void kvm_null_fn(void) node; \ node = interval_tree_iter_next(node, start, last)) \ -static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_mmu_notifier_range *range) +static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_mmu_notifier_range *range) { struct kvm_mmu_notifier_return r = { .ret = false, @@ -571,6 +572,10 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, IS_KVM_NULL_FN(range->handler))) return r; + /* on_lock will never be called for lockless walks */ + if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock))) + return r; + idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { @@ -607,15 +612,18 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, gfn_range.start = hva_to_gfn_memslot(hva_start, slot); gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; + gfn_range.lockless = range->lockless; if (!r.found_memslot) { r.found_memslot = true; - KVM_MMU_LOCK(kvm); - if (!IS_KVM_NULL_FN(range->on_lock)) - range->on_lock(kvm); - - if (IS_KVM_NULL_FN(range->handler)) - goto mmu_unlock; + if (!range->lockless) { + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + + if (IS_KVM_NULL_FN(range->handler)) + goto mmu_unlock; + } } r.ret |= range->handler(kvm, &gfn_range); } @@ -625,7 +633,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); mmu_unlock: - if (r.found_memslot) + if (r.found_memslot && !range->lockless) KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); @@ -633,7 +641,7 @@ mmu_unlock: return r; } -static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, +static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler, @@ -647,17 +655,18 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .on_lock = (void *)kvm_null_fn, .flush_on_ret = flush_on_ret, .may_block = false, + .lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING), }; - return __kvm_handle_hva_range(kvm, &range).ret; + return kvm_handle_hva_range(kvm, &range).ret; } -static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler) +static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + gfn_handler_t handler) { - return kvm_handle_hva_range(mn, start, end, handler, false); + return kvm_age_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) @@ -752,7 +761,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * that guest memory has been reclaimed. This needs to be done *after* * dropping mmu_lock, as x86's reclaim path is slooooow. */ - if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) + if (kvm_handle_hva_range(kvm, &hva_range).found_memslot) kvm_arch_guest_memory_reclaimed(kvm); return 0; @@ -798,7 +807,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, }; bool wake; - __kvm_handle_hva_range(kvm, &hva_range); + kvm_handle_hva_range(kvm, &hva_range); /* Pairs with the increment in range_start(). */ spin_lock(&kvm->mn_invalidate_lock); @@ -822,8 +831,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, { trace_kvm_age_hva(start, end); - return kvm_handle_hva_range(mn, start, end, kvm_age_gfn, - !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); + return kvm_age_hva_range(mn, start, end, kvm_age_gfn, + !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -846,7 +855,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ - return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); + return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, @@ -855,8 +864,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, { trace_kvm_test_age_hva(address); - return kvm_handle_hva_range_no_flush(mn, address, address + 1, - kvm_test_age_gfn); + return kvm_age_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, |