summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r--arch/x86/kvm/mmu/mmu.c601
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h85
-rw-r--r--arch/x86/kvm/mmu/page_track.c3
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/mmu/spte.c60
-rw-r--r--arch/x86/kvm/mmu/spte.h8
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c10
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h55
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c420
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h51
10 files changed, 955 insertions, 341 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2401606db260..4e06e2e89a8f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed;
#ifdef CONFIG_X86_64
bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+EXPORT_SYMBOL_GPL(tdp_mmu_enabled);
#endif
static int max_huge_page_level __read_mostly;
@@ -501,7 +502,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
return false;
}
- if (!spte_has_volatile_bits(old_spte))
+ if (!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
@@ -524,7 +525,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
int level = sptep_to_sp(sptep)->role.level;
if (!is_shadow_present_pte(old_spte) ||
- !spte_has_volatile_bits(old_spte))
+ !spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
@@ -599,6 +600,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
return r;
+ if (kvm_has_mirrored_tdp(vcpu->kvm)) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
PT64_ROOT_MAX_LEVEL);
if (r)
@@ -618,6 +625,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
@@ -846,32 +854,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* About rmap_head encoding:
*
* If the bit zero of rmap_head->val is clear, then it points to the only spte
- * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
* pte_list_desc containing more mappings.
*/
#define KVM_RMAP_MANY BIT(0)
/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write. This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected. E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED BIT(1)
+
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Elide the lock if the rmap is empty, as lockless walkers (read-only
+ * mode) don't need to (and can't) walk an empty rmap, nor can they add
+ * entries to the rmap. I.e. the only paths that process empty rmaps
+ * do so while holding mmu_lock for write, and are mutually exclusive.
+ */
+ old_val = atomic_long_read(&rmap_head->val);
+ if (!old_val)
+ return 0;
+
+ do {
+ /*
+ * If the rmap is locked, wait for it to be unlocked before
+ * trying acquire the lock, e.g. to avoid bouncing the cache
+ * line.
+ */
+ while (old_val & KVM_RMAP_LOCKED) {
+ cpu_relax();
+ old_val = atomic_long_read(&rmap_head->val);
+ }
+
+ /*
+ * Recheck for an empty rmap, it may have been purged by the
+ * task that held the lock.
+ */
+ if (!old_val)
+ return 0;
+
+ new_val = old_val | KVM_RMAP_LOCKED;
+ /*
+ * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+ * from being reordered outside of the critical section created by
+ * __kvm_rmap_lock().
+ *
+ * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+ *
+ * For the !old_val case, no ordering is needed, as there is no rmap
+ * to walk.
+ */
+ } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+ /*
+ * Return the old value, i.e. _without_ the LOCKED bit set. It's
+ * impossible for the return value to be 0 (see above), i.e. the read-
+ * only unlock flow can't get a false positive and fail to unlock.
+ */
+ return old_val;
+}
+
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+ unsigned long val)
+{
+ KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
+ /*
+ * Ensure that all accesses to the rmap have completed before unlocking
+ * the rmap.
+ *
+ * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
+ */
+ atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ unsigned long new_val)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ __kvm_rmap_unlock(rmap_head, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+ return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val;
+
+ preempt_disable();
+ rmap_val = __kvm_rmap_lock(rmap_head);
+
+ if (!rmap_val)
+ preempt_enable();
+
+ return rmap_val;
+}
+
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+ unsigned long old_val)
+{
+ if (!old_val)
+ return;
+
+ KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+ __kvm_rmap_unlock(rmap_head, old_val);
+ preempt_enable();
+}
+
+/*
* Returns the number of pointers in the rmap chain, not counting the new one.
*/
-static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
- struct kvm_rmap_head *rmap_head)
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ u64 *spte, struct kvm_rmap_head *rmap_head)
{
+ unsigned long old_val, new_val;
struct pte_list_desc *desc;
int count = 0;
- if (!rmap_head->val) {
- rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
+ old_val = kvm_rmap_lock(kvm, rmap_head);
+
+ if (!old_val) {
+ new_val = (unsigned long)spte;
+ } else if (!(old_val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->sptes[0] = (u64 *)rmap_head->val;
+ desc->sptes[0] = (u64 *)old_val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -880,21 +1029,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ } else {
+ new_val = old_val;
}
desc->sptes[desc->spte_count++] = spte;
}
+
+ kvm_rmap_unlock(kvm, rmap_head, new_val);
+
return count;
}
-static void pte_list_desc_remove_entry(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
+static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -921,9 +1074,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
- rmap_head->val = 0;
+ *rmap_val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
+ *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -931,24 +1084,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
+ unsigned long rmap_val;
int i;
- if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
- return;
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
+ goto out;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
- if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
- return;
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
+ goto out;
- rmap_head->val = 0;
+ rmap_val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(kvm, rmap_head,
+ pte_list_desc_remove_entry(kvm, &rmap_val,
desc, i);
- return;
+ goto out;
}
}
desc = desc->more;
@@ -956,6 +1111,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
+
+out:
+ kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -970,17 +1128,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc, *next;
+ unsigned long rmap_val;
int i;
- if (!rmap_head->val)
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (!rmap_val)
return false;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
- mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -990,20 +1150,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
}
out:
/* rmap_head is meaningless now, remember to reset it */
- rmap_head->val = 0;
+ kvm_rmap_unlock(kvm, rmap_head, 0);
return true;
}
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
struct pte_list_desc *desc;
- if (!rmap_head->val)
+ if (!rmap_val)
return 0;
- else if (!(rmap_head->val & KVM_RMAP_MANY))
+ else if (!(rmap_val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1046,6 +1207,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
*/
struct rmap_iterator {
/* private fields */
+ struct rmap_head *head;
struct pte_list_desc *desc; /* holds the sptep if not NULL */
int pos; /* index of the sptep */
};
@@ -1060,23 +1222,19 @@ struct rmap_iterator {
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
struct rmap_iterator *iter)
{
- u64 *sptep;
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
- if (!rmap_head->val)
+ if (!rmap_val)
return NULL;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
+ if (!(rmap_val & KVM_RMAP_MANY)) {
iter->desc = NULL;
- sptep = (u64 *)rmap_head->val;
- goto out;
+ return (u64 *)rmap_val;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
iter->pos = 0;
- sptep = iter->desc->sptes[iter->pos];
-out:
- BUG_ON(!is_shadow_present_pte(*sptep));
- return sptep;
+ return iter->desc->sptes[iter->pos];
}
/*
@@ -1086,14 +1244,11 @@ out:
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
- u64 *sptep;
-
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos;
- sptep = iter->desc->sptes[iter->pos];
- if (sptep)
- goto out;
+ if (iter->desc->sptes[iter->pos])
+ return iter->desc->sptes[iter->pos];
}
iter->desc = iter->desc->more;
@@ -1101,20 +1256,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
- sptep = iter->desc->sptes[iter->pos];
- goto out;
+ return iter->desc->sptes[iter->pos];
}
}
return NULL;
-out:
- BUG_ON(!is_shadow_present_pte(*sptep));
- return sptep;
}
-#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
- for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
- _spte_; _spte_ = rmap_get_next(_iter_))
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
+ _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep)))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1200,12 +1359,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmap_head, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (spte_ad_need_write_protect(*sptep))
flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
+ }
return flush;
}
@@ -1297,15 +1457,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
* enabled but it chooses between clearing the Dirty bit and Writeable
* bit based on the context.
*/
- if (kvm_x86_ops.cpu_dirty_log_size)
+ if (kvm->arch.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-int kvm_cpu_dirty_log_size(void)
+int kvm_cpu_dirty_log_size(struct kvm *kvm)
{
- return kvm_x86_ops.cpu_dirty_log_size;
+ return kvm->arch.cpu_dirty_log_size;
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
@@ -1394,7 +1554,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
while (++iterator->rmap <= iterator->end_rmap) {
iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
- if (iterator->rmap->val)
+ if (atomic_long_read(&iterator->rmap->val))
return;
}
@@ -1526,7 +1686,7 @@ static void __rmap_add(struct kvm *kvm,
kvm_update_page_stats(kvm, sp->role.level, 1);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- rmap_count = pte_list_add(cache, spte, rmap_head);
+ rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
if (rmap_count > kvm->stat.max_mmu_rmap_size)
kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1545,51 +1705,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
}
static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range, bool test_only)
+ struct kvm_gfn_range *range,
+ bool test_only)
{
- struct slot_rmap_walk_iterator iterator;
+ struct kvm_rmap_head *rmap_head;
struct rmap_iterator iter;
+ unsigned long rmap_val;
bool young = false;
u64 *sptep;
+ gfn_t gfn;
+ int level;
+ u64 spte;
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator) {
- for_each_rmap_spte(iterator.rmap, &iter, sptep) {
- u64 spte = *sptep;
+ for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ for (gfn = range->start; gfn < range->end;
+ gfn += KVM_PAGES_PER_HPAGE(level)) {
+ rmap_head = gfn_to_rmap(gfn, level, range->slot);
+ rmap_val = kvm_rmap_lock_readonly(rmap_head);
- if (!is_accessed_spte(spte))
- continue;
+ for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+ if (!is_accessed_spte(spte))
+ continue;
- if (test_only)
- return true;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * WARN if mmu_spte_update() signals the need
- * for a TLB flush, as Access tracking a SPTE
- * should never trigger an _immediate_ flush.
- */
- spte = mark_spte_for_access_track(spte);
- WARN_ON_ONCE(mmu_spte_update(sptep, spte));
+ if (test_only) {
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ return true;
+ }
+
+ if (spte_ad_enabled(spte))
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ else
+ /*
+ * If the following cmpxchg fails, the
+ * spte is being concurrently modified
+ * and should most likely stay young.
+ */
+ cmpxchg64(sptep, spte,
+ mark_spte_for_access_track(spte));
+ young = true;
}
- young = true;
+
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
}
}
return young;
}
+static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
- if (kvm_memslots_have_rmaps(kvm))
- young = kvm_rmap_age_gfn_range(kvm, range, false);
-
if (tdp_mmu_enabled)
- young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
+ young = kvm_tdp_mmu_age_gfn_range(kvm, range);
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, false);
return young;
}
@@ -1598,11 +1774,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
- if (kvm_memslots_have_rmaps(kvm))
- young = kvm_rmap_age_gfn_range(kvm, range, true);
-
if (tdp_mmu_enabled)
- young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
+ young = kvm_tdp_mmu_test_age_gfn(kvm, range);
+
+ if (young)
+ return young;
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, true);
return young;
}
@@ -1649,13 +1828,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
-static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
if (!parent_pte)
return;
- pte_list_add(cache, parent_pte, &sp->parent_ptes);
+ pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2345,7 +2525,7 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_spte_set(sptep, spte);
- mmu_page_add_parent_pte(cache, sp, sptep);
+ mmu_page_add_parent_pte(kvm, cache, sp, sptep);
/*
* The non-direct sub-pagetable must be updated before linking. For
@@ -2409,7 +2589,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
* avoids retaining a large number of stale nested SPs.
*/
if (tdp_enabled && invalid_list &&
- child->role.guest_mode && !child->parent_ptes.val)
+ child->role.guest_mode &&
+ !atomic_long_read(&child->parent_ptes.val))
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
@@ -2839,7 +3020,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (is_shadow_present_pte(*sptep)) {
- if (prefetch)
+ if (prefetch && is_last_spte(*sptep, level) &&
+ pfn == spte_to_pfn(*sptep))
return RET_PF_SPURIOUS;
/*
@@ -2853,7 +3035,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
child = spte_to_child_sp(pte);
drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
- } else if (pfn != spte_to_pfn(*sptep)) {
+ } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
@@ -3656,8 +3838,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
- if (tdp_mmu_enabled)
- return kvm_tdp_mmu_alloc_root(vcpu);
+ if (tdp_mmu_enabled) {
+ if (kvm_has_mirrored_tdp(vcpu->kvm) &&
+ !VALID_PAGE(mmu->mirror_root_hpa))
+ kvm_tdp_mmu_alloc_root(vcpu, true);
+ kvm_tdp_mmu_alloc_root(vcpu, false);
+ return 0;
+ }
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
@@ -4379,8 +4566,12 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, unsigned int access)
{
struct kvm_memory_slot *slot = fault->slot;
+ struct kvm *kvm = vcpu->kvm;
int ret;
+ if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm))
+ return -EFAULT;
+
/*
* Note that the mmu_invalidate_seq also serves to detect a concurrent
* change in attributes. is_page_fault_stale() will detect an
@@ -4394,7 +4585,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* Now that we have a snapshot of mmu_invalidate_seq we can check for a
* private vs. shared mismatch.
*/
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
@@ -4456,7 +4647,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_mmu_faultin_pfn(vcpu, fault);
@@ -4476,7 +4667,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* overall cost of failing to detect the invalidation until after
* mmu_lock is acquired.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
return RET_PF_RETRY;
}
@@ -4646,19 +4837,6 @@ out_unlock:
}
#endif
-bool kvm_mmu_may_ignore_guest_pat(void)
-{
- /*
- * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
- * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
- * honor the memtype from the guest's PAT so that guest accesses to
- * memory that is DMA'd aren't cached against the guest's wishes. As a
- * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
- * KVM _always_ ignores guest PAT (when EPT is enabled).
- */
- return shadow_memtype_mask;
-}
-
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
#ifdef CONFIG_X86_64
@@ -4669,8 +4847,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
-static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
- u8 *level)
+int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
{
int r;
@@ -4684,6 +4861,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
do {
if (signal_pending(current))
return -EINTR;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
cond_resched();
r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
} while (r == RET_PF_RETRY);
@@ -4708,18 +4889,23 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
return -EIO;
}
}
+EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
{
u64 error_code = PFERR_GUEST_FINAL_MASK;
u8 level = PG_LEVEL_4K;
+ u64 direct_bits;
u64 end;
int r;
if (!vcpu->kvm->arch.pre_fault_allowed)
return -EOPNOTSUPP;
+ if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ return -EINVAL;
+
/*
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
@@ -4728,15 +4914,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
if (r)
return r;
+ direct_bits = 0;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
error_code |= PFERR_PRIVATE_ACCESS;
+ else
+ direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
/*
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
@@ -5022,7 +5211,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
guest_cpuid_is_amd_compatible(vcpu));
}
@@ -5099,7 +5288,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
is_pse, is_amd);
if (!shadow_me_mask)
@@ -5400,12 +5589,19 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
+ int maxpa;
+
+ if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM)
+ maxpa = cpuid_query_maxguestphyaddr(vcpu);
+ else
+ maxpa = cpuid_maxphyaddr(vcpu);
+
/* tdp_root_level is architecture forced level, use it if nonzero */
if (tdp_root_level)
return tdp_root_level;
/* Use 5-level TDP if and only if it's useful/necessary. */
- if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+ if (max_tdp_level == 5 && maxpa <= 48)
return 4;
return max_tdp_level;
@@ -5524,7 +5720,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
union kvm_mmu_page_role root_role;
/* NPT requires CR0.PG=1. */
- WARN_ON_ONCE(cpu_role.base.direct);
+ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
root_role = cpu_role.base;
root_role.level = kvm_mmu_get_tdp_level(vcpu);
@@ -5724,6 +5920,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -5785,6 +5982,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots);
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
int *bytes)
@@ -6095,8 +6293,16 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
else if (r == RET_PF_SPURIOUS)
vcpu->stat.pf_spurious++;
+ /*
+ * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or
+ * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE.
+ * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to
+ * indicate continuing the page fault handling until to the final
+ * page table mapping phase.
+ */
+ WARN_ON_ONCE(r == RET_PF_CONTINUE);
if (r != RET_PF_EMULATE)
- return 1;
+ return r;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
@@ -6272,6 +6478,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
+ mmu->mirror_root_hpa = INVALID_PAGE;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -6441,8 +6648,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (tdp_mmu_enabled)
- kvm_tdp_mmu_invalidate_all_roots(kvm);
+ if (tdp_mmu_enabled) {
+ /*
+ * External page tables don't support fast zapping, therefore
+ * their mirrors must be invalidated separately by the caller.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
+ }
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
@@ -6467,7 +6679,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* lead to use-after-free.
*/
if (tdp_mmu_enabled)
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
void kvm_mmu_init_vm(struct kvm *kvm)
@@ -7035,6 +7247,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm,
.start = slot->base_gfn,
.end = slot->base_gfn + slot->npages,
.may_block = true,
+ .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED,
};
bool flush;
@@ -7090,6 +7303,19 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be valid even though the VM is globally visible. Do nothing,
+ * as such a VM can't have any possible NX huge pages.
+ */
+ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
+
+ if (nx_thread)
+ vhost_task_wake(nx_thread);
+}
+
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
@@ -7150,7 +7376,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
}
mutex_unlock(&kvm_lock);
}
@@ -7220,6 +7446,12 @@ out:
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
+ if (tdp_mmu_enabled) {
+ read_lock(&vcpu->kvm->mmu_lock);
+ mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa,
+ NULL);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ }
free_mmu_pages(&vcpu->arch.root_mmu);
free_mmu_pages(&vcpu->arch.guest_mmu);
mmu_free_memory_caches(vcpu);
@@ -7279,7 +7511,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
mutex_unlock(&kvm_lock);
}
@@ -7411,23 +7643,35 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
return true;
}
-int kvm_mmu_post_init_vm(struct kvm *kvm)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
{
- if (nx_hugepage_mitigation_hard_disabled)
- return 0;
+ struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct vhost_task *nx_thread;
kvm->arch.nx_huge_page_last = get_jiffies_64();
- kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
- kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
- kvm, "kvm-nx-lpage-recovery");
+ nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
+ kvm_nx_huge_page_recovery_worker_kill,
+ kvm, "kvm-nx-lpage-recovery");
- if (!kvm->arch.nx_huge_page_recovery_thread)
- return -ENOMEM;
+ if (IS_ERR(nx_thread))
+ return PTR_ERR(nx_thread);
- vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+ vhost_task_start(nx_thread);
+
+ /* Make the task visible only once it is fully started. */
+ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
return 0;
}
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
+}
+
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.nx_huge_page_recovery_thread)
@@ -7435,9 +7679,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range)
{
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
/*
* Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
* supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
@@ -7452,27 +7717,49 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
- return kvm_unmap_gfn_range(kvm, range);
-}
+ if (WARN_ON_ONCE(range->end <= range->start))
+ return false;
-static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
-}
+ /*
+ * If the head and tail pages of the range currently allow a hugepage,
+ * i.e. reside fully in the slot and don't have mixed attributes, then
+ * add each corresponding hugepage range to the ongoing invalidation,
+ * e.g. to prevent KVM from creating a hugepage in response to a fault
+ * for a gfn whose attributes aren't changing. Note, only the range
+ * of gfns whose attributes are being modified needs to be explicitly
+ * unmapped, as that will unmap any existing hugepages.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t start = gfn_round_for_level(range->start, level);
+ gfn_t end = gfn_round_for_level(range->end - 1, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
-static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
-}
+ if ((start != range->start || start + nr_pages > range->end) &&
+ start >= slot->base_gfn &&
+ start + nr_pages <= slot->base_gfn + slot->npages &&
+ !hugepage_test_mixed(slot, start, level))
+ kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
-static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+ if (end == start)
+ continue;
+
+ if ((end + nr_pages) > range->end &&
+ (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+ !hugepage_test_mixed(slot, end, level))
+ kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+ }
+
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
+ return kvm_unmap_gfn_range(kvm, range);
}
+
+
static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, int level, unsigned long attrs)
{
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index b00abbe3f6cf..db8f33e4de62 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
+#include "mmu.h"
+
#ifdef CONFIG_KVM_PROVE_MMU
#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
@@ -101,7 +103,22 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
- unsigned int unsync_children;
+ union {
+ /* These two members aren't used for TDP MMU */
+ struct {
+ unsigned int unsync_children;
+ /*
+ * Number of writes since the last time traversal
+ * visited this page.
+ */
+ atomic_t write_flooding_count;
+ };
+ /*
+ * Page table page of external PT.
+ * Passed to TDX module, not accessed by KVM.
+ */
+ void *external_spt;
+ };
union {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
@@ -124,9 +141,6 @@ struct kvm_mmu_page {
int clear_spte_count;
#endif
- /* Number of writes since the last time traversal visited this page. */
- atomic_t write_flooding_count;
-
#ifdef CONFIG_X86_64
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
@@ -145,7 +159,36 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return kvm_mmu_role_as_id(sp->role);
}
-static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
+static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
+{
+ return sp->role.is_mirror;
+}
+
+static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ /*
+ * external_spt is allocated for TDX module to hold private EPT mappings,
+ * TDX module will initialize the page by itself.
+ * Therefore, KVM does not need to initialize or access external_spt.
+ * KVM only interacts with sp->spt for private EPT operations.
+ */
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+}
+
+static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
+{
+ /*
+ * Since mirror SPs are used only for TDX, which maps private memory
+ * at its "natural" GFN, no mask needs to be applied to them - and, dually,
+ * we expect that the bits is only used for the shared PT.
+ */
+ if (is_mirror_sp(root))
+ return 0;
+ return kvm_gfn_direct_bits(kvm);
+}
+
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
{
/*
* When using the EPT page-modification log, the GPAs in the CPU dirty
@@ -155,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
* being enabled is mandatory as the bits used to denote WP-only SPTEs
* are reserved for PAE paging (32-bit KVM).
*/
- return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
+ return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode;
}
static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
@@ -229,7 +272,12 @@ struct kvm_page_fault {
*/
u8 goal_level;
- /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+ * the case of VM where memslot's can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
gfn_t gfn;
/* The memslot containing gfn. May be NULL. */
@@ -268,9 +316,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
*
* Note, all values must be greater than or equal to zero so as not to encroach
- * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
- * will allow for efficient machine code when checking for CONTINUE, e.g.
- * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
+ * on -errno return values.
*/
enum {
RET_PF_CONTINUE = 0,
@@ -282,6 +328,14 @@ enum {
RET_PF_SPURIOUS,
};
+/*
+ * Define RET_PF_CONTINUE as 0 to allow for
+ * - efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
+ * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
+ */
+static_assert(RET_PF_CONTINUE == 0);
+
static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -317,10 +371,19 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int r;
if (vcpu->arch.mmu->root_role.direct) {
- fault.gfn = fault.addr >> PAGE_SHIFT;
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
+ /*
+ * With retpoline being active an indirect call is rather expensive,
+ * so do a direct call in the most common case.
+ */
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 561c331fd6ec..1b17b12393a8 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm)
struct kvm_memory_slot *slot;
int r = 0, i, bkt;
+ if (kvm->arch.vm_type == KVM_X86_TDX_VM)
+ return -EOPNOTSUPP;
+
mutex_lock(&kvm->slots_arch_lock);
/*
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f4711674c47b..68e323568e95 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -510,8 +510,7 @@ error:
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
- walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
- EPT_VIOLATION_RWX_SHIFT;
+ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 22551e2f1d00..cfce03d8f123 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value;
u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
-u64 __read_mostly shadow_memtype_mask;
u64 __read_mostly shadow_me_value;
u64 __read_mostly shadow_me_mask;
u64 __read_mostly shadow_acc_track_mask;
@@ -96,8 +95,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
-
access &= shadow_mmio_access_mask;
spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
@@ -129,25 +126,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}
/*
- * Returns true if the SPTE has bits that may be set without holding mmu_lock.
- * The caller is responsible for checking if the SPTE is shadow-present, and
- * for determining whether or not the caller cares about non-leaf SPTEs.
+ * Returns true if the SPTE needs to be updated atomically due to having bits
+ * that may be changed without holding mmu_lock, and for which KVM must not
+ * lose information. E.g. KVM must not drop Dirty bit information. The caller
+ * is responsible for checking if the SPTE is shadow-present, and for
+ * determining whether or not the caller cares about non-leaf SPTEs.
*/
-bool spte_has_volatile_bits(u64 spte)
+bool spte_needs_atomic_update(u64 spte)
{
+ /* SPTEs can be made Writable bit by KVM's fast page fault handler. */
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
- if (is_access_track_spte(spte))
+ /*
+ * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
+ * SPTEs can be restored by KVM's fast page fault handler.
+ */
+ if (!spte_ad_enabled(spte))
return true;
- if (spte_ad_enabled(spte)) {
- if (!(spte & shadow_accessed_mask) ||
- (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
- return true;
- }
-
- return false;
+ /*
+ * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
+ * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
+ * invalidate TLBs when aging SPTEs, and so it's safe to clobber the
+ * Accessed bit (and rare in practice).
+ */
+ return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -170,7 +174,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
- else if (kvm_mmu_page_ad_need_write_protect(sp))
+ else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
spte |= shadow_present_mask;
@@ -205,9 +209,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
- if (shadow_memtype_mask)
- spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= shadow_host_writable_mask;
else
@@ -433,6 +435,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
+{
+ kvm->arch.shadow_mmio_value = mmio_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value);
+
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
{
/* shadow_me_value must be a subset of shadow_me_mask */
@@ -456,13 +464,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
shadow_present_mask =
(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
- /*
- * EPT overrides the host MTRRs, and so KVM must program the desired
- * memtype directly into the SPTEs. Note, this mask is just the mask
- * of all bits that factor into the memtype, the actual memtype must be
- * dynamically calculated, e.g. to ensure host MMIO is mapped UC.
- */
- shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
+
shadow_acc_track_mask = VMX_EPT_RWX_MASK;
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
@@ -514,12 +516,6 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_x_mask = 0;
shadow_present_mask = PT_PRESENT_MASK;
- /*
- * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
- * memtype in the SPTEs, i.e. relies on host MTRRs to provide the
- * correct memtype (WB is the "weakest" memtype).
- */
- shadow_memtype_mask = 0;
shadow_acc_track_mask = 0;
shadow_me_mask = 0;
shadow_me_value = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index af10bc0380a3..1e94f081bdaf 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value;
extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
-extern u64 __read_mostly shadow_memtype_mask;
extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
@@ -276,6 +275,11 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
+static inline bool is_mirror_sptep(tdp_ptep_t sptep)
+{
+ return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
@@ -514,7 +518,7 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
-bool spte_has_volatile_bits(u64 spte);
+bool spte_needs_atomic_update(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 04c247bfe318..9e17bfa80901 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
- SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+ SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
@@ -37,15 +37,17 @@ void tdp_iter_restart(struct tdp_iter *iter)
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn)
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
{
if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
- (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ (root->role.level > PT64_ROOT_MAX_LEVEL) ||
+ (gfn_bits && next_last_level_gfn >= gfn_bits))) {
iter->valid = false;
return;
}
iter->next_last_level_gfn = next_last_level_gfn;
+ iter->gfn_bits = gfn_bits;
iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
@@ -113,7 +115,7 @@ static bool try_step_side(struct tdp_iter *iter)
* Check if the iterator is already at the end of the current page
* table.
*/
- if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
+ if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
(SPTE_ENT_PER_PAGE - 1))
return false;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 2880fd392e0c..364c5da6c499 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
return xchg(rcu_dereference(sptep), new_spte);
}
+static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
+{
+ atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+}
+
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
@@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
}
/*
- * SPTEs must be modified atomically if they are shadow-present, leaf
- * SPTEs, and have volatile bits, i.e. has bits that can be set outside
- * of mmu_lock. The Writable bit can be set by KVM's fast page fault
- * handler, and Accessed and Dirty bits can be set by the CPU.
- *
- * Note, non-leaf SPTEs do have Accessed bits and those bits are
- * technically volatile, but KVM doesn't consume the Accessed bit of
- * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
- * logic needs to be reassessed if KVM were to use non-leaf Accessed
- * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
+ * and have volatile bits (bits that can be set outside of mmu_lock) that
+ * must not be clobbered.
*/
-static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
{
return is_shadow_present_pte(old_spte) &&
is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte);
+ spte_needs_atomic_update(old_spte);
}
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
- if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
u64 mask, int level)
{
- atomic64_t *sptep_atomic;
-
- if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
- sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
- return (u64)atomic64_fetch_and(~mask, sptep_atomic);
- }
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
return old_spte;
@@ -93,8 +89,10 @@ struct tdp_iter {
tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
tdp_ptep_t sptep;
- /* The lowest GFN mapped by the current SPTE */
+ /* The lowest GFN (mask bits excluded) mapped by the current SPTE */
gfn_t gfn;
+ /* Mask applied to convert the GFN to the mapping GPA */
+ gfn_t gfn_bits;
/* The level of the root page given to the iterator */
int root_level;
/* The lowest level the iterator should traverse to */
@@ -122,18 +120,23 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
- for (tdp_iter_start(&iter, root, min_level, start); \
- iter.valid && iter.gfn < end; \
+#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
+ iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, start, end) \
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte_min_level_all(iter, root, min_level) \
+ for (tdp_iter_start(&iter, root, min_level, 0, 0); \
+ iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, kvm, root, start, end) \
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn);
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2f15e0e33903..7f3d7229b2c1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -37,10 +37,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
* for zapping and thus puts the TDP MMU's reference to each root, i.e.
* ultimately frees all roots.
*/
- kvm_tdp_mmu_invalidate_all_roots(kvm);
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
- WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+#ifdef CONFIG_KVM_PROVE_MMU
+ KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+#endif
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -53,6 +55,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
+ free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -91,19 +94,33 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
+ return false;
+
+ if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
+ return false;
+
+ if (likely(!is_mirror_sp(root)))
+ return types & KVM_DIRECT_ROOTS;
+ return types & KVM_MIRROR_ROOTS;
+}
+
/*
* Returns the next root after @prev_root (or the first root if @prev_root is
- * NULL). A reference to the returned root is acquired, and the reference to
- * @prev_root is released (the caller obviously must hold a reference to
- * @prev_root if it's non-NULL).
+ * NULL) that matches with @types. A reference to the returned root is
+ * acquired, and the reference to @prev_root is released (the caller obviously
+ * must hold a reference to @prev_root if it's non-NULL).
*
- * If @only_valid is true, invalid roots are skipped.
+ * Roots that doesn't match with @types are skipped.
*
* Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool only_valid)
+ enum kvm_tdp_mmu_root_types types)
{
struct kvm_mmu_page *next_root;
@@ -124,7 +141,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
typeof(*next_root), link);
while (next_root) {
- if ((!only_valid || !next_root->role.invalid) &&
+ if (tdp_mmu_root_match(next_root, types) &&
kvm_tdp_mmu_get_root(next_root))
break;
@@ -149,20 +166,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ _root = tdp_mmu_next_root(_kvm, _root, _types)) \
if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, false))
+ _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -171,18 +188,28 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
- ((_only_valid) && (_root)->role.invalid))) { \
+ !tdp_mmu_root_match((_root), (_types)))) { \
} else
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
+ */
+#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types))) { \
+ } else
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
@@ -223,7 +250,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role role = mmu->root_role;
@@ -231,6 +258,9 @@ int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
+ if (mirror)
+ role.is_mirror = true;
+
/*
* Check for an existing root before acquiring the pages lock to avoid
* unnecessary serialization if multiple vCPUs are loading a new root.
@@ -282,9 +312,12 @@ out_read_unlock:
* and actually consuming the root if it's invalidated after dropping
* mmu_lock, and the root can't be freed as this vCPU holds a reference.
*/
- mmu->root.hpa = __pa(root->spt);
- mmu->root.pgd = 0;
- return 0;
+ if (mirror) {
+ mmu->mirror_root_hpa = __pa(root->spt);
+ } else {
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ }
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -294,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
+#ifdef CONFIG_KVM_PROVE_MMU
atomic64_inc(&kvm->arch.tdp_mmu_pages);
+#endif
}
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
+#ifdef CONFIG_KVM_PROVE_MMU
atomic64_dec(&kvm->arch.tdp_mmu_pages);
+#endif
}
/**
@@ -322,6 +359,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
+static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ int level)
+{
+ kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+ int ret;
+
+ /*
+ * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
+ * PTs are removed in a special order, involving free_external_spt().
+ * But remove_external_spte() will be called on non-leaf PTEs via
+ * __tdp_mmu_zap_root(), so avoid the error the former would return
+ * in this case.
+ */
+ if (!is_last_spte(old_spte, level))
+ return;
+
+ /* Zapping leaf spte is allowed only when write lock is held. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ /* Because write lock is held, operation should success. */
+ ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
+ KVM_BUG_ON(ret, kvm);
+}
+
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -417,11 +477,81 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_spte, FROZEN_SPTE, level, shared);
+
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
+ /*
+ * Failed to free page table page in mirror page table and
+ * there is nothing to do further.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->external_spt = NULL;
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
+{
+ if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ WARN_ON_ONCE(sp->role.level + 1 != level);
+ WARN_ON_ONCE(sp->gfn != gfn);
+ return sp->external_spt;
+ }
+
+ return NULL;
+}
+
+static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
+ gfn_t gfn, u64 old_spte,
+ u64 new_spte, int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
+ int ret = 0;
+
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ /*
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
+ */
+ if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
+ return -EBUSY;
+
+ /*
+ * Use different call to either set up middle level
+ * external page table, or leaf.
+ */
+ if (is_leaf) {
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
+ } else {
+ void *external_spt = get_external_spt(gfn, new_spte, level);
+
+ KVM_BUG_ON(!external_spt, kvm);
+ ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
+ }
+ if (ret)
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return ret;
+}
+
/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
@@ -522,11 +652,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
-static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
u64 new_spte)
{
- u64 *sptep = rcu_dereference(iter->sptep);
-
/*
* The caller is responsible for ensuring the old SPTE is not a FROZEN
* SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
@@ -535,15 +664,34 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
*/
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
- /*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
- * does not hold the mmu_lock. On failure, i.e. if a different logical
- * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
- * the current value, so the caller operates on fresh data, e.g. if it
- * retries tdp_mmu_set_spte_atomic()
- */
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
- return -EBUSY;
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ int ret;
+
+ /*
+ * Users of atomic zapping don't operate on mirror roots,
+ * so don't handle it and bug the VM if it's seen.
+ */
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EBUSY;
+
+ ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ if (ret)
+ return ret;
+ } else {
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
+ * and does not hold the mmu_lock. On failure, i.e. if a
+ * different logical CPU modified the SPTE, try_cmpxchg64()
+ * updates iter->old_spte with the current value, so the caller
+ * operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic()
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+ }
return 0;
}
@@ -573,7 +721,7 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
if (ret)
return ret;
@@ -613,6 +761,16 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+
+ /*
+ * Users that do non-atomic setting of PTEs don't operate on mirror
+ * roots, so don't handle it and bug the VM if it's seen.
+ */
+ if (is_mirror_sptep(sptep)) {
+ KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+
return old_spte;
}
@@ -625,19 +783,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
iter->gfn, iter->level);
}
-#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root, _start, _end)
+#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
-#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
- tdp_root_for_each_pte(_iter, _root, _start, _end) \
+#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
if (!is_shadow_present_pte(_iter.old_spte) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
continue; \
else
-#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
-
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -705,10 +860,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- gfn_t end = tdp_mmu_max_gfn_exclusive();
- gfn_t start = 0;
-
- for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+ for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -812,7 +964,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
@@ -863,19 +1015,21 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *root;
/*
- * Zap all roots, including invalid roots, as all SPTEs must be dropped
- * before returning to the caller. Zap directly even if the root is
- * also being zapped by a worker. Walking zapped top-level SPTEs isn't
- * all that expensive and mmu_lock is already held, which means the
- * worker has yielded, i.e. flushing the work instead of zapping here
- * isn't guaranteed to be any faster.
+ * Zap all direct roots, including invalid direct roots, as all direct
+ * SPTEs must be dropped before returning to the caller. For TDX, mirror
+ * roots don't need handling in response to the mmu notifier (the caller).
+ *
+ * Zap directly even if the root is also being zapped by a concurrent
+ * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
+ * and mmu_lock is already held, which means the other thread has yielded.
*
* A TLB flush is unnecessary, KVM zaps everything if and only the VM
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
+ KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
tdp_mmu_zap_root(kvm, root, false);
}
@@ -883,11 +1037,14 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes.
*/
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
struct kvm_mmu_page *root;
- read_lock(&kvm->mmu_lock);
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
@@ -905,7 +1062,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* that may be zapped, as such entries are associated with the
* ASID on both VMX and SVM.
*/
- tdp_mmu_zap_root(kvm, root, true);
+ tdp_mmu_zap_root(kvm, root, shared);
/*
* The referenced needs to be put *after* zapping the root, as
@@ -915,7 +1072,10 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
kvm_tdp_mmu_put_root(kvm, root);
}
- read_unlock(&kvm->mmu_lock);
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
}
/*
@@ -928,11 +1088,19 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
* See kvm_tdp_mmu_alloc_root().
*/
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types)
{
struct kvm_mmu_page *root;
/*
+ * Invalidating invalid roots doesn't make sense, prevent developers from
+ * having to think about it.
+ */
+ if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
+ root_types &= ~KVM_INVALID_ROOTS;
+
+ /*
* mmu_lock must be held for write to ensure that a root doesn't become
* invalid while there are active readers (invalidating a root while
* there are active readers may or may not be problematic in practice,
@@ -953,6 +1121,9 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
* or get/put references to roots.
*/
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!tdp_mmu_root_match(root, root_types))
+ continue;
+
/*
* Note, invalid roots can outlive a memslot update! Invalid
* roots must be *zapped* before the memslot update completes,
@@ -982,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
- if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
- return RET_PF_SPURIOUS;
-
if (is_shadow_present_pte(iter->old_spte) &&
- is_access_allowed(fault, iter->old_spte) &&
- is_last_spte(iter->old_spte, iter->level))
+ (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ is_last_spte(iter->old_spte, iter->level)) {
+ WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
return RET_PF_SPURIOUS;
+ }
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
@@ -1068,7 +1238,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
*/
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
@@ -1080,7 +1250,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1107,13 +1277,18 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
+ if (is_mirror_sp(sp))
+ kvm_mmu_alloc_external_spt(vcpu, sp);
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- if (is_shadow_present_pte(iter.old_spte))
+ if (is_shadow_present_pte(iter.old_spte)) {
+ /* Don't support large page for mirrored roots (TDX) */
+ KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
- else
+ } else {
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ }
/*
* Force the guest to retry if installing an upper level SPTE
@@ -1148,12 +1323,16 @@ retry:
return ret;
}
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
- __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
@@ -1168,21 +1347,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
-static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
+static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
if (spte_ad_enabled(iter->old_spte)) {
- iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
- iter->old_spte,
- shadow_accessed_mask,
- iter->level);
+ iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+ shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
- iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
- iter->old_spte, new_spte,
- iter->level);
+ /*
+ * It is safe for the following cmpxchg to fail. Leave the
+ * Accessed bit set, as the spte is most likely young anyway.
+ */
+ if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+ return;
}
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1193,20 +1373,24 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
bool test_only)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
+
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code. Note,
* this helper must NOT be used to unmap GFNs, as it processes only
* valid roots!
*/
- for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
- guard(rcu)();
+ WARN_ON(types & ~KVM_VALID_ROOTS);
- tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+ guard(rcu)();
+ for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@@ -1214,7 +1398,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
return true;
ret = true;
- kvm_tdp_mmu_age_spte(&iter);
+ kvm_tdp_mmu_age_spte(kvm, &iter);
}
}
@@ -1247,7 +1431,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1366,7 +1550,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
* level above the target level (e.g. splitting a 1GB to 512 2MB pages,
* and then splitting each of those to 512 4KB pages).
*/
- for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -1445,26 +1629,26 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
}
}
-static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
+static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
{
/*
* All TDP MMU shadow pages share the same role as their root, aside
* from level, so it is valid to key off any shadow page to determine if
* write protection is needed for an entire tree.
*/
- return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
+ return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
}
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
- const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
struct tdp_iter iter;
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1504,15 +1688,15 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
- const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
+ tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
break;
@@ -1566,7 +1750,7 @@ static int tdp_mmu_make_huge_spte(struct kvm *kvm,
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
struct tdp_iter iter;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
/*
* Use the parent iterator when checking for forward progress so
* that KVM doesn't get stuck continuously trying to yield (i.e.
@@ -1600,7 +1784,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
@@ -1681,7 +1865,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -1726,17 +1910,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
*
* Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- int *root_level)
+static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ struct kvm_mmu_page *root)
{
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
- *root_level = vcpu->arch.mmu->root_role.level;
-
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1744,6 +1925,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
return leaf;
}
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+ *root_level = vcpu->arch.mmu->root_role.level;
+
+ return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
+}
+
+bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool is_direct = kvm_is_addr_direct(kvm, gpa);
+ hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
+ vcpu->arch.mmu->mirror_root_hpa;
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
+ int leaf;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ rcu_read_lock();
+ leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
+ rcu_read_unlock();
+ if (leaf < 0)
+ return false;
+
+ spte = sptes[leaf];
+ return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
+}
+EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped);
+
/*
* Returns the last level spte pointer of the shadow page walk for the given
* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
@@ -1758,11 +1969,12 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
+ /* Fast pf is not supported for mirrored roots */
+ struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
tdp_ptep_t sptep = NULL;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index f03ca0dd13d9..52acf99d40a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
@@ -19,11 +19,56 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+enum kvm_tdp_mmu_root_types {
+ KVM_INVALID_ROOTS = BIT(0),
+ KVM_DIRECT_ROOTS = BIT(1),
+ KVM_MIRROR_ROOTS = BIT(2),
+ KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS,
+ KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS,
+};
+
+static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm,
+ enum kvm_gfn_range_filter process)
+{
+ enum kvm_tdp_mmu_root_types ret = 0;
+
+ if (!kvm_has_mirrored_tdp(kvm))
+ return KVM_DIRECT_ROOTS;
+
+ if (process & KVM_FILTER_PRIVATE)
+ ret |= KVM_MIRROR_ROOTS;
+ if (process & KVM_FILTER_SHARED)
+ ret |= KVM_DIRECT_ROOTS;
+
+ WARN_ON_ONCE(!ret);
+
+ return ret;
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
+ enum kvm_tdp_mmu_root_types type)
+{
+ if (unlikely(type == KVM_MIRROR_ROOTS))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);