summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c972
-rw-r--r--arch/x86/kvm/cpuid.h128
-rw-r--r--arch/x86/kvm/governed_features.h22
-rw-r--r--arch/x86/kvm/hyperv.c2
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/kvm_emulate.h2
-rw-r--r--arch/x86/kvm/lapic.c31
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.h33
-rw-r--r--arch/x86/kvm/mmu/mmu.c109
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h80
-rw-r--r--arch/x86/kvm/mmu/spte.h5
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c10
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c325
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h51
-rw-r--r--arch/x86/kvm/pmu.c1
-rw-r--r--arch/x86/kvm/reverse_cpuid.h23
-rw-r--r--arch/x86/kvm/smm.c10
-rw-r--r--arch/x86/kvm/svm/nested.c22
-rw-r--r--arch/x86/kvm/svm/pmu.c8
-rw-r--r--arch/x86/kvm/svm/sev.c45
-rw-r--r--arch/x86/kvm/svm/svm.c78
-rw-r--r--arch/x86/kvm/svm/svm.h21
-rw-r--r--arch/x86/kvm/trace.h17
-rw-r--r--arch/x86/kvm/vmx/hyperv.h2
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h2
-rw-r--r--arch/x86/kvm/vmx/main.c4
-rw-r--r--arch/x86/kvm/vmx/nested.c102
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c4
-rw-r--r--arch/x86/kvm/vmx/sgx.c14
-rw-r--r--arch/x86/kvm/vmx/vmx.c176
-rw-r--r--arch/x86/kvm/vmx/vmx.h6
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h2
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h6
-rw-r--r--arch/x86/kvm/x86.c264
-rw-r--r--arch/x86/kvm/x86.h34
37 files changed, 1714 insertions, 921 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ae0b438a2c99..8eb3a88707f2 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -82,15 +82,6 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
-#define F feature_bit
-
-/* Scattered Flag - For features that are scattered by cpufeatures.h. */
-#define SF(name) \
-({ \
- BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
- (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \
-})
-
/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
* doesn't care about the CPIUD index because the index of the function in
@@ -100,8 +91,8 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
*/
#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
-static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
- struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
+static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
+ u32 function, u64 index)
{
struct kvm_cpuid_entry2 *e;
int i;
@@ -118,8 +109,8 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
*/
lockdep_assert_irqs_enabled();
- for (i = 0; i < nent; i++) {
- e = &entries[i];
+ for (i = 0; i < vcpu->arch.cpuid_nent; i++) {
+ e = &vcpu->arch.cpuid_entries[i];
if (e->function != function)
continue;
@@ -151,9 +142,27 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
return NULL;
}
-static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid_entry2 *entries,
- int nent)
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ return cpuid_entry2_find(vcpu, function, index);
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function)
+{
+ return cpuid_entry2_find(vcpu, function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+/*
+ * cpuid_entry2_find() and KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used
+ * directly outside of kvm_find_cpuid_entry() and kvm_find_cpuid_entry_index().
+ */
+#undef KVM_CPUID_INDEX_NOT_SIGNIFICANT
+
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
u64 xfeatures;
@@ -162,8 +171,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
* The existing code assumes virtual address is 48-bit or 57-bit in the
* canonical address checks; exit if it is ever changed.
*/
- best = cpuid_entry2_find(entries, nent, 0x80000008,
- KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
if (best) {
int vaddr_bits = (best->eax & 0xff00) >> 8;
@@ -175,7 +183,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
* Exposing dynamic xfeatures to the guest requires additional
* enabling in the FPU, e.g. to expand the guest XSAVE state size.
*/
- best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0);
if (!best)
return 0;
@@ -187,6 +195,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
}
+static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu);
+
/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
@@ -194,6 +204,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
struct kvm_cpuid_entry2 *orig;
int i;
+ /*
+ * Apply runtime CPUID updates to the incoming CPUID entries to avoid
+ * false positives due mismatches on KVM-owned feature flags.
+ *
+ * Note! @e2 and @nent track the _old_ CPUID entries!
+ */
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_apply_cpuid_pv_features_quirk(vcpu);
+
if (nent != vcpu->arch.cpuid_nent)
return -EINVAL;
@@ -210,15 +229,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
return 0;
}
-static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries,
- int nent, const char *sig)
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
{
struct kvm_hypervisor_cpuid cpuid = {};
struct kvm_cpuid_entry2 *entry;
u32 base;
for_each_possible_hypervisor_cpuid_base(base) {
- entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ entry = kvm_find_cpuid_entry(vcpu, base);
if (entry) {
u32 signature[3];
@@ -238,118 +257,90 @@ static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_e
return cpuid;
}
-static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
- const char *sig)
+static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu)
{
- return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent, sig);
-}
-
-static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries,
- int nent, u32 kvm_cpuid_base)
-{
- return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES,
- KVM_CPUID_INDEX_NOT_SIGNIFICANT);
-}
-
-static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
-{
- u32 base = vcpu->arch.kvm_cpuid.base;
+ struct kvm_hypervisor_cpuid kvm_cpuid;
+ struct kvm_cpuid_entry2 *best;
- if (!base)
- return NULL;
+ kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ if (!kvm_cpuid.base)
+ return 0;
- return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent, base);
-}
+ best = kvm_find_cpuid_entry(vcpu, kvm_cpuid.base | KVM_CPUID_FEATURES);
+ if (!best)
+ return 0;
-void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu);
+ if (kvm_hlt_in_guest(vcpu->kvm))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
- /*
- * save the feature bitmap to avoid cpuid lookup for every PV
- * operation
- */
- if (best)
- vcpu->arch.pv_cpuid.features = best->eax;
+ return best->eax;
}
/*
* Calculate guest's supported XCR0 taking into account guest CPUID data and
* KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
*/
-static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
+static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0);
if (!best)
return 0;
return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
}
-static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
- int nent)
+static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool has_feature)
+{
+ cpuid_entry_change(entry, x86_feature, has_feature);
+ guest_cpu_cap_change(vcpu, x86_feature, has_feature);
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- struct kvm_hypervisor_cpuid kvm_cpuid;
- best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ best = kvm_find_cpuid_entry(vcpu, 1);
if (best) {
- /* Update OSXSAVE bit */
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE,
kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
- cpuid_entry_change(best, X86_FEATURE_APIC,
- vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_APIC,
+ vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_MWAIT,
+ vcpu->arch.ia32_misc_enable_msr &
+ MSR_IA32_MISC_ENABLE_MWAIT);
}
- best = cpuid_entry2_find(entries, nent, 7, 0);
- if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
- cpuid_entry_change(best, X86_FEATURE_OSPKE,
- kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
+ best = kvm_find_cpuid_entry_index(vcpu, 7, 0);
+ if (best)
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSPKE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
+
- best = cpuid_entry2_find(entries, nent, 0xD, 0);
+ best = kvm_find_cpuid_entry_index(vcpu, 0xD, 0);
if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
- best = cpuid_entry2_find(entries, nent, 0xD, 1);
+ best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
-
- kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE);
- if (kvm_cpuid.base) {
- best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base);
- if (kvm_hlt_in_guest(vcpu->kvm) && best)
- best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
- }
-
- if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
- best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
- if (best)
- cpuid_entry_change(best, X86_FEATURE_MWAIT,
- vcpu->arch.ia32_misc_enable_msr &
- MSR_IA32_MISC_ENABLE_MWAIT);
- }
-}
-
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
-{
- __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
-static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
struct kvm_cpuid_entry2 *entry;
- entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
- KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
#else
return false;
@@ -368,15 +359,71 @@ static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
}
-static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+/*
+ * This isn't truly "unsafe", but except for the cpu_caps initialization code,
+ * all register lookups should use __cpuid_entry_get_reg(), which provides
+ * compile-time validation of the input.
+ */
+static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return entry->eax;
+ case CPUID_EBX:
+ return entry->ebx;
+ case CPUID_ECX:
+ return entry->ecx;
+ case CPUID_EDX:
+ return entry->edx;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
+ bool include_partially_emulated);
+
+void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ struct kvm_cpuid_entry2 *entry;
bool allow_gbpages;
+ int i;
+
+ memset(vcpu->arch.cpu_caps, 0, sizeof(vcpu->arch.cpu_caps));
+ BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS);
- BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);
- bitmap_zero(vcpu->arch.governed_features.enabled,
- KVM_MAX_NR_GOVERNED_FEATURES);
+ /*
+ * Reset guest capabilities to userspace's guest CPUID definition, i.e.
+ * honor userspace's definition for features that don't require KVM or
+ * hardware management/support (or that KVM simply doesn't care about).
+ */
+ for (i = 0; i < NR_KVM_CPU_CAPS; i++) {
+ const struct cpuid_reg cpuid = reverse_cpuid[i];
+ struct kvm_cpuid_entry2 emulated;
+
+ if (!cpuid.function)
+ continue;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ continue;
+
+ cpuid_func_emulated(&emulated, cpuid.function, true);
+
+ /*
+ * A vCPU has a feature if it's supported by KVM and is enabled
+ * in guest CPUID. Note, this includes features that are
+ * supported by KVM but aren't advertised to userspace!
+ */
+ vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] |
+ cpuid_get_reg_unsafe(&emulated, cpuid.reg);
+ vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, cpuid.reg);
+ }
+
+ kvm_update_cpuid_runtime(vcpu);
/*
* If TDP is enabled, let the guest use GBPAGES if they're supported in
@@ -390,9 +437,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* and can install smaller shadow pages if the host lacks 1GiB support.
*/
allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
- if (allow_gbpages)
- kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES);
+ guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages);
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
@@ -404,21 +450,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
- vcpu->arch.guest_supported_xcr0 =
- cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+ vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu);
- kvm_update_pv_runtime(vcpu);
+ vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu);
vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
kvm_pmu_refresh(vcpu);
- vcpu->arch.cr4_guest_rsvd_bits =
- __cr4_reserved_bits(guest_cpuid_has, vcpu);
- kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent));
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) |
+ __cr4_reserved_bits(guest_cpu_cap_has, vcpu);
+#undef __kvm_cpu_cap_has
+
+ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu));
/* Invoke the vendor callback only after the above state is updated. */
kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
@@ -457,9 +504,25 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
{
+ u32 vcpu_caps[NR_KVM_CPU_CAPS];
int r;
- __kvm_update_cpuid_runtime(vcpu, e2, nent);
+ /*
+ * Swap the existing (old) entries with the incoming (new) entries in
+ * order to massage the new entries, e.g. to account for dynamic bits
+ * that KVM controls, without clobbering the current guest CPUID, which
+ * KVM needs to preserve in order to unwind on failure.
+ *
+ * Similarly, save the vCPU's current cpu_caps so that the capabilities
+ * can be updated alongside the CPUID entries when performing runtime
+ * updates. Full initialization is done if and only if the vCPU hasn't
+ * run, i.e. only if userspace is potentially changing CPUID features.
+ */
+ swap(vcpu->arch.cpuid_entries, e2);
+ swap(vcpu->arch.cpuid_nent, nent);
+
+ memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps));
+ BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps));
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
@@ -475,35 +538,36 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
if (kvm_vcpu_has_run(vcpu)) {
r = kvm_cpuid_check_equal(vcpu, e2, nent);
if (r)
- return r;
-
- kvfree(e2);
- return 0;
+ goto err;
+ goto success;
}
#ifdef CONFIG_KVM_HYPERV
- if (kvm_cpuid_has_hyperv(e2, nent)) {
+ if (kvm_cpuid_has_hyperv(vcpu)) {
r = kvm_hv_vcpu_init(vcpu);
if (r)
- return r;
+ goto err;
}
#endif
- r = kvm_check_cpuid(vcpu, e2, nent);
+ r = kvm_check_cpuid(vcpu);
if (r)
- return r;
-
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = nent;
+ goto err;
- vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
#ifdef CONFIG_KVM_XEN
vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
#endif
kvm_vcpu_after_set_cpuid(vcpu);
+success:
+ kvfree(e2);
return 0;
+
+err:
+ memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps));
+ swap(vcpu->arch.cpuid_entries, e2);
+ swap(vcpu->arch.cpuid_nent, nent);
+ return r;
}
/* when an old userspace process fills a new kernel module */
@@ -590,107 +654,294 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
return 0;
}
-/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
-static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
+static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid)
{
- const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
struct kvm_cpuid_entry2 entry;
+ u32 base;
- reverse_cpuid_check(leaf);
+ /*
+ * KVM only supports features defined by Intel (0x0), AMD (0x80000000),
+ * and Centaur (0xc0000000). WARN if a feature for new vendor base is
+ * defined, as this and other code would need to be updated.
+ */
+ base = cpuid.function & 0xffff0000;
+ if (WARN_ON_ONCE(base && base != 0x80000000 && base != 0xc0000000))
+ return 0;
+
+ if (cpuid_eax(base) < cpuid.function)
+ return 0;
cpuid_count(cpuid.function, cpuid.index,
&entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
- kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
+ return *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
-static __always_inline
-void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
-{
- /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
- BUILD_BUG_ON(leaf < NCAPINTS);
+/*
+ * For kernel-defined leafs, mask KVM's supported feature set with the kernel's
+ * capabilities as well as raw CPUID. For KVM-defined leafs, consult only raw
+ * CPUID, as KVM is the one and only authority (in the kernel).
+ */
+#define kvm_cpu_cap_init(leaf, feature_initializers...) \
+do { \
+ const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); \
+ const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf; \
+ const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability; \
+ u32 kvm_cpu_cap_passthrough = 0; \
+ u32 kvm_cpu_cap_synthesized = 0; \
+ u32 kvm_cpu_cap_emulated = 0; \
+ u32 kvm_cpu_cap_features = 0; \
+ \
+ feature_initializers \
+ \
+ kvm_cpu_caps[leaf] = kvm_cpu_cap_features; \
+ \
+ if (leaf < NCAPINTS) \
+ kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf]; \
+ \
+ kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough; \
+ kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) | \
+ kvm_cpu_cap_synthesized); \
+ kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated; \
+} while (0)
- kvm_cpu_caps[leaf] = mask;
+/*
+ * Assert that the feature bit being declared, e.g. via F(), is in the CPUID
+ * word that's being initialized. Exempt 0x8000_0001.EDX usage of 0x1.EDX
+ * features, as AMD duplicated many 0x1.EDX features into 0x8000_0001.EDX.
+ */
+#define KVM_VALIDATE_CPU_CAP_USAGE(name) \
+do { \
+ u32 __leaf = __feature_leaf(X86_FEATURE_##name); \
+ \
+ BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress); \
+} while (0)
+
+#define F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ kvm_cpu_cap_features |= feature_bit(name); \
+})
- __kvm_cpu_cap_mask(leaf);
-}
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SCATTERED_F(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ if (boot_cpu_has(X86_FEATURE_##name)) \
+ F(name); \
+})
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
-{
- /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
- BUILD_BUG_ON(leaf >= NCAPINTS);
+/* Features that KVM supports only on 64-bit kernels. */
+#define X86_64_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ if (IS_ENABLED(CONFIG_X86_64)) \
+ F(name); \
+})
- kvm_cpu_caps[leaf] &= mask;
+/*
+ * Emulated Feature - For features that KVM emulates in software irrespective
+ * of host CPU/kernel support.
+ */
+#define EMULATED_F(name) \
+({ \
+ kvm_cpu_cap_emulated |= feature_bit(name); \
+ F(name); \
+})
- __kvm_cpu_cap_mask(leaf);
-}
+/*
+ * Synthesized Feature - For features that are synthesized into boot_cpu_data,
+ * i.e. may not be present in the raw CPUID, but can still be advertised to
+ * userspace. Primarily used for mitigation related feature flags.
+ */
+#define SYNTHESIZED_F(name) \
+({ \
+ kvm_cpu_cap_synthesized |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Passthrough Feature - For features that KVM supports based purely on raw
+ * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't
+ * use the feature. Simply force set the feature in KVM's capabilities, raw
+ * CPUID support will be factored in by kvm_cpu_cap_mask().
+ */
+#define PASSTHROUGH_F(name) \
+({ \
+ kvm_cpu_cap_passthrough |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of
+ * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001.
+ */
+#define ALIASED_1_EDX_F(name) \
+({ \
+ BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX); \
+ BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX); \
+ kvm_cpu_cap_features |= feature_bit(name); \
+})
+
+/*
+ * Vendor Features - For features that KVM supports, but are added in later
+ * because they require additional vendor enabling.
+ */
+#define VENDOR_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+})
+
+/*
+ * Runtime Features - For features that KVM dynamically sets/clears at runtime,
+ * e.g. when CR4 changes, but which are never advertised to userspace.
+ */
+#define RUNTIME_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+})
+
+/*
+ * Undefine the MSR bit macro to avoid token concatenation issues when
+ * processing X86_FEATURE_SPEC_CTRL_SSBD.
+ */
+#undef SPEC_CTRL_SSBD
+
+/* DS is defined by ptrace-abi.h on 32-bit builds. */
+#undef DS
void kvm_set_cpu_caps(void)
{
-#ifdef CONFIG_X86_64
- unsigned int f_gbpages = F(GBPAGES);
- unsigned int f_lm = F(LM);
- unsigned int f_xfd = F(XFD);
-#else
- unsigned int f_gbpages = 0;
- unsigned int f_lm = 0;
- unsigned int f_xfd = 0;
-#endif
memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
sizeof(boot_cpu_data.x86_capability));
- memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
- sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
-
- kvm_cpu_cap_mask(CPUID_1_ECX,
+ kvm_cpu_cap_init(CPUID_1_ECX,
+ F(XMM3),
+ F(PCLMULQDQ),
+ VENDOR_F(DTES64),
/*
* NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
- * advertised to guests via CPUID!
+ * advertised to guests via CPUID! MWAIT is also technically a
+ * runtime flag thanks to IA32_MISC_ENABLES; mark it as such so
+ * that KVM is aware that it's a known, unadvertised flag.
*/
- F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
- 0 /* DS-CPL, VMX, SMX, EST */ |
- 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
- F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C) | F(RDRAND)
+ RUNTIME_F(MWAIT),
+ /* DS-CPL */
+ VENDOR_F(VMX),
+ /* SMX, EST */
+ /* TM2 */
+ F(SSSE3),
+ /* CNXT-ID */
+ /* Reserved */
+ F(FMA),
+ F(CX16),
+ /* xTPR Update */
+ F(PDCM),
+ F(PCID),
+ /* Reserved, DCA */
+ F(XMM4_1),
+ F(XMM4_2),
+ EMULATED_F(X2APIC),
+ F(MOVBE),
+ F(POPCNT),
+ EMULATED_F(TSC_DEADLINE_TIMER),
+ F(AES),
+ F(XSAVE),
+ RUNTIME_F(OSXSAVE),
+ F(AVX),
+ F(F16C),
+ F(RDRAND),
+ EMULATED_F(HYPERVISOR),
);
- /* KVM emulates x2apic in software irrespective of host support. */
- kvm_cpu_cap_set(X86_FEATURE_X2APIC);
-
- kvm_cpu_cap_mask(CPUID_1_EDX,
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
- 0 /* Reserved, DS, ACPI */ | F(MMX) |
- F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
- 0 /* HTT, TM, Reserved, PBE */
+
+ kvm_cpu_cap_init(CPUID_1_EDX,
+ F(FPU),
+ F(VME),
+ F(DE),
+ F(PSE),
+ F(TSC),
+ F(MSR),
+ F(PAE),
+ F(MCE),
+ F(CX8),
+ F(APIC),
+ /* Reserved */
+ F(SEP),
+ F(MTRR),
+ F(PGE),
+ F(MCA),
+ F(CMOV),
+ F(PAT),
+ F(PSE36),
+ /* PSN */
+ F(CLFLUSH),
+ /* Reserved */
+ VENDOR_F(DS),
+ /* ACPI */
+ F(MMX),
+ F(FXSR),
+ F(XMM),
+ F(XMM2),
+ F(SELFSNOOP),
+ /* HTT, TM, Reserved, PBE */
);
- kvm_cpu_cap_mask(CPUID_7_0_EBX,
- F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
- F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
- F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
- F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
- F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
- F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
- F(AVX512VL));
-
- kvm_cpu_cap_mask(CPUID_7_ECX,
- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
- F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
- F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
- F(SGX_LC) | F(BUS_LOCK_DETECT)
+ kvm_cpu_cap_init(CPUID_7_0_EBX,
+ F(FSGSBASE),
+ EMULATED_F(TSC_ADJUST),
+ F(SGX),
+ F(BMI1),
+ F(HLE),
+ F(AVX2),
+ F(FDP_EXCPTN_ONLY),
+ F(SMEP),
+ F(BMI2),
+ F(ERMS),
+ F(INVPCID),
+ F(RTM),
+ F(ZERO_FCS_FDS),
+ VENDOR_F(MPX),
+ F(AVX512F),
+ F(AVX512DQ),
+ F(RDSEED),
+ F(ADX),
+ F(SMAP),
+ F(AVX512IFMA),
+ F(CLFLUSHOPT),
+ F(CLWB),
+ VENDOR_F(INTEL_PT),
+ F(AVX512PF),
+ F(AVX512ER),
+ F(AVX512CD),
+ F(SHA_NI),
+ F(AVX512BW),
+ F(AVX512VL),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_ECX,
+ F(AVX512VBMI),
+ PASSTHROUGH_F(LA57),
+ F(PKU),
+ RUNTIME_F(OSPKE),
+ F(RDPID),
+ F(AVX512_VPOPCNTDQ),
+ F(UMIP),
+ F(AVX512_VBMI2),
+ F(GFNI),
+ F(VAES),
+ F(VPCLMULQDQ),
+ F(AVX512_VNNI),
+ F(AVX512_BITALG),
+ F(CLDEMOTE),
+ F(MOVDIRI),
+ F(MOVDIR64B),
+ VENDOR_F(WAITPKG),
+ F(SGX_LC),
+ F(BUS_LOCK_DETECT),
);
- /* Set LA57 based on hardware capability. */
- if (cpuid_ecx(7) & F(LA57))
- kvm_cpu_cap_set(X86_FEATURE_LA57);
/*
* PKU not yet implemented for shadow paging and requires OSPKE
@@ -699,18 +950,25 @@ void kvm_set_cpu_caps(void)
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
kvm_cpu_cap_clear(X86_FEATURE_PKU);
- kvm_cpu_cap_mask(CPUID_7_EDX,
- F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
- F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
- F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
- F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
+ kvm_cpu_cap_init(CPUID_7_EDX,
+ F(AVX512_4VNNIW),
+ F(AVX512_4FMAPS),
+ F(SPEC_CTRL),
+ F(SPEC_CTRL_SSBD),
+ EMULATED_F(ARCH_CAPABILITIES),
+ F(INTEL_STIBP),
+ F(MD_CLEAR),
+ F(AVX512_VP2INTERSECT),
+ F(FSRM),
+ F(SERIALIZE),
+ F(TSXLDTRK),
+ F(AVX512_FP16),
+ F(AMX_TILE),
+ F(AMX_INT8),
+ F(AMX_BF16),
+ F(FLUSH_L1D),
);
- /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
- kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
- kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);
-
if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) &&
boot_cpu_has(X86_FEATURE_AMD_IBPB) &&
boot_cpu_has(X86_FEATURE_AMD_IBRS))
@@ -720,65 +978,133 @@ void kvm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
- kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(SHA512) | F(SM3) | F(SM4) | F(AVX_VNNI) | F(AVX512_BF16) |
- F(CMPCCXADD) | F(FZRM) | F(FSRS) | F(FSRC) | F(AMX_FP16) |
- F(AVX_IFMA) | F(LAM)
+ kvm_cpu_cap_init(CPUID_7_1_EAX,
+ F(SHA512),
+ F(SM3),
+ F(SM4),
+ F(AVX_VNNI),
+ F(AVX512_BF16),
+ F(CMPCCXADD),
+ F(FZRM),
+ F(FSRS),
+ F(FSRC),
+ F(AMX_FP16),
+ F(AVX_IFMA),
+ F(LAM),
);
- kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
- F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(AMX_COMPLEX) |
- F(AVX_VNNI_INT16) | F(PREFETCHITI) | F(AVX10)
+ kvm_cpu_cap_init(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8),
+ F(AVX_NE_CONVERT),
+ F(AMX_COMPLEX),
+ F(AVX_VNNI_INT16),
+ F(PREFETCHITI),
+ F(AVX10),
);
- kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
- F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) |
- F(BHI_CTRL) | F(MCDT_NO)
+ kvm_cpu_cap_init(CPUID_7_2_EDX,
+ F(INTEL_PSFD),
+ F(IPRED_CTRL),
+ F(RRSBA_CTRL),
+ F(DDPD_U),
+ F(BHI_CTRL),
+ F(MCDT_NO),
);
- kvm_cpu_cap_mask(CPUID_D_1_EAX,
- F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
+ kvm_cpu_cap_init(CPUID_D_1_EAX,
+ F(XSAVEOPT),
+ F(XSAVEC),
+ F(XGETBV1),
+ F(XSAVES),
+ X86_64_F(XFD),
);
- kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
- SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
+ kvm_cpu_cap_init(CPUID_12_EAX,
+ SCATTERED_F(SGX1),
+ SCATTERED_F(SGX2),
+ SCATTERED_F(SGX_EDECCSSA),
);
- kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX,
- F(AVX10_128) | F(AVX10_256) | F(AVX10_512)
+ kvm_cpu_cap_init(CPUID_24_0_EBX,
+ F(AVX10_128),
+ F(AVX10_256),
+ F(AVX10_512),
);
- kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
- F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
- F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT) | 0 /* PERFCTR_CORE */
+ kvm_cpu_cap_init(CPUID_8000_0001_ECX,
+ F(LAHF_LM),
+ F(CMP_LEGACY),
+ VENDOR_F(SVM),
+ /* ExtApicSpace */
+ F(CR8_LEGACY),
+ F(ABM),
+ F(SSE4A),
+ F(MISALIGNSSE),
+ F(3DNOWPREFETCH),
+ F(OSVW),
+ /* IBS */
+ F(XOP),
+ /* SKINIT, WDT, LWP */
+ F(FMA4),
+ F(TBM),
+ F(TOPOEXT),
+ VENDOR_F(PERFCTR_CORE),
);
- kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* Reserved */ |
- F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
- 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
+ kvm_cpu_cap_init(CPUID_8000_0001_EDX,
+ ALIASED_1_EDX_F(FPU),
+ ALIASED_1_EDX_F(VME),
+ ALIASED_1_EDX_F(DE),
+ ALIASED_1_EDX_F(PSE),
+ ALIASED_1_EDX_F(TSC),
+ ALIASED_1_EDX_F(MSR),
+ ALIASED_1_EDX_F(PAE),
+ ALIASED_1_EDX_F(MCE),
+ ALIASED_1_EDX_F(CX8),
+ ALIASED_1_EDX_F(APIC),
+ /* Reserved */
+ F(SYSCALL),
+ ALIASED_1_EDX_F(MTRR),
+ ALIASED_1_EDX_F(PGE),
+ ALIASED_1_EDX_F(MCA),
+ ALIASED_1_EDX_F(CMOV),
+ ALIASED_1_EDX_F(PAT),
+ ALIASED_1_EDX_F(PSE36),
+ /* Reserved */
+ F(NX),
+ /* Reserved */
+ F(MMXEXT),
+ ALIASED_1_EDX_F(MMX),
+ ALIASED_1_EDX_F(FXSR),
+ F(FXSR_OPT),
+ X86_64_F(GBPAGES),
+ F(RDTSCP),
+ /* Reserved */
+ X86_64_F(LM),
+ F(3DNOWEXT),
+ F(3DNOW),
);
if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
- kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX,
- SF(CONSTANT_TSC)
+ kvm_cpu_cap_init(CPUID_8000_0007_EDX,
+ SCATTERED_F(CONSTANT_TSC),
);
- kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
- F(CLZERO) | F(XSAVEERPTR) |
- F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
- F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
- F(AMD_PSFD) | F(AMD_IBPB_RET)
+ kvm_cpu_cap_init(CPUID_8000_0008_EBX,
+ F(CLZERO),
+ F(XSAVEERPTR),
+ F(WBNOINVD),
+ F(AMD_IBPB),
+ F(AMD_IBRS),
+ F(AMD_SSBD),
+ F(VIRT_SSBD),
+ F(AMD_SSB_NO),
+ F(AMD_STIBP),
+ F(AMD_STIBP_ALWAYS_ON),
+ F(AMD_PSFD),
+ F(AMD_IBPB_RET),
);
/*
@@ -808,50 +1134,73 @@ void kvm_set_cpu_caps(void)
!boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
- /*
- * Hide all SVM features by default, SVM will set the cap bits for
- * features it emulates and/or exposes for L1.
- */
- kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
-
- kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
- 0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ |
- F(SME_COHERENT));
+ /* All SVM features required additional vendor module enabling. */
+ kvm_cpu_cap_init(CPUID_8000_000A_EDX,
+ VENDOR_F(NPT),
+ VENDOR_F(VMCBCLEAN),
+ VENDOR_F(FLUSHBYASID),
+ VENDOR_F(NRIPS),
+ VENDOR_F(TSCRATEMSR),
+ VENDOR_F(V_VMSAVE_VMLOAD),
+ VENDOR_F(LBRV),
+ VENDOR_F(PAUSEFILTER),
+ VENDOR_F(PFTHRESHOLD),
+ VENDOR_F(VGIF),
+ VENDOR_F(VNMI),
+ VENDOR_F(SVME_ADDR_CHK),
+ );
- kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
- F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
- F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
- F(WRMSR_XX_BASE_NS)
+ kvm_cpu_cap_init(CPUID_8000_001F_EAX,
+ VENDOR_F(SME),
+ VENDOR_F(SEV),
+ /* VM_PAGE_FLUSH */
+ VENDOR_F(SEV_ES),
+ F(SME_COHERENT),
);
- kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
- kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
- kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
+ kvm_cpu_cap_init(CPUID_8000_0021_EAX,
+ F(NO_NESTED_DATA_BP),
+ /*
+ * Synthesize "LFENCE is serializing" into the AMD-defined entry
+ * in KVM's supported CPUID, i.e. if the feature is reported as
+ * supported by the kernel. LFENCE_RDTSC was a Linux-defined
+ * synthetic feature long before AMD joined the bandwagon, e.g.
+ * LFENCE is serializing on most CPUs that support SSE2. On
+ * CPUs that don't support AMD's leaf, ANDing with the raw host
+ * CPUID will drop the flags, and reporting support in AMD's
+ * leaf can make it easier for userspace to detect the feature.
+ */
+ SYNTHESIZED_F(LFENCE_RDTSC),
+ /* SmmPgCfgLock */
+ F(NULL_SEL_CLR_BASE),
+ F(AUTOIBRS),
+ EMULATED_F(NO_SMM_CTL_MSR),
+ /* PrefetchCtlMsr */
+ F(WRMSR_XX_BASE_NS),
+ SYNTHESIZED_F(SBPB),
+ SYNTHESIZED_F(IBPB_BRTYPE),
+ SYNTHESIZED_F(SRSO_NO),
+ F(SRSO_USER_KERNEL_NO),
+ );
- kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
- F(PERFMON_V2)
+ kvm_cpu_cap_init(CPUID_8000_0022_EAX,
+ F(PERFMON_V2),
);
- /*
- * Synthesize "LFENCE is serializing" into the AMD-defined entry in
- * KVM's supported CPUID if the feature is reported as supported by the
- * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long
- * before AMD joined the bandwagon, e.g. LFENCE is serializing on most
- * CPUs that support SSE2. On CPUs that don't support AMD's leaf,
- * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing
- * the mask with the raw host CPUID, and reporting support in AMD's
- * leaf can make it easier for userspace to detect the feature.
- */
- if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
- kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC);
if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
- kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR);
- kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
- F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
- F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
- F(PMM) | F(PMM_EN)
+ kvm_cpu_cap_init(CPUID_C000_0001_EDX,
+ F(XSTORE),
+ F(XSTORE_EN),
+ F(XCRYPT),
+ F(XCRYPT_EN),
+ F(ACE2),
+ F(ACE2_EN),
+ F(PHE),
+ F(PHE_EN),
+ F(PMM),
+ F(PMM_EN),
);
/*
@@ -871,6 +1220,16 @@ void kvm_set_cpu_caps(void)
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
+#undef F
+#undef SCATTERED_F
+#undef X86_64_F
+#undef EMULATED_F
+#undef SYNTHESIZED_F
+#undef PASSTHROUGH_F
+#undef ALIASED_1_EDX_F
+#undef VENDOR_F
+#undef RUNTIME_F
+
struct kvm_cpuid_array {
struct kvm_cpuid_entry2 *entries;
int maxnent;
@@ -928,14 +1287,11 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
return entry;
}
-static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
+static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
+ bool include_partially_emulated)
{
- struct kvm_cpuid_entry2 *entry;
-
- if (array->nent >= array->maxnent)
- return -E2BIG;
+ memset(entry, 0, sizeof(*entry));
- entry = &array->entries[array->nent];
entry->function = func;
entry->index = 0;
entry->flags = 0;
@@ -943,23 +1299,37 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
switch (func) {
case 0:
entry->eax = 7;
- ++array->nent;
- break;
+ return 1;
case 1:
- entry->ecx = F(MOVBE);
- ++array->nent;
- break;
+ entry->ecx = feature_bit(MOVBE);
+ /*
+ * KVM allows userspace to enumerate MONITOR+MWAIT support to
+ * the guest, but the MWAIT feature flag is never advertised
+ * to userspace because MONITOR+MWAIT aren't virtualized by
+ * hardware, can't be faithfully emulated in software (KVM
+ * emulates them as NOPs), and allowing the guest to execute
+ * them natively requires enabling a per-VM capability.
+ */
+ if (include_partially_emulated)
+ entry->ecx |= feature_bit(MWAIT);
+ return 1;
case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0;
if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
- entry->ecx = F(RDPID);
- ++array->nent;
- break;
+ entry->ecx = feature_bit(RDPID);
+ return 1;
default:
- break;
+ return 0;
}
+}
+
+static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
+{
+ if (array->nent >= array->maxnent)
+ return -E2BIG;
+ array->nent += cpuid_func_emulated(&array->entries[array->nent], func, false);
return 0;
}
@@ -1103,7 +1473,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
cpuid_entry_override(entry, CPUID_D_1_EAX);
- if (entry->eax & (F(XSAVES)|F(XSAVEC)))
+ if (entry->eax & (feature_bit(XSAVES) | feature_bit(XSAVEC)))
entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
true);
else {
@@ -1540,22 +1910,6 @@ out_free:
return r;
}
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
- function, index);
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function)
-{
- return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
- function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-
/*
* Intel CPUID semantics treats any query for an out-of-range leaf as if the
* highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics
@@ -1648,10 +2002,10 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u64 data;
if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
(data & TSX_CTRL_CPUID_CLEAR))
- *ebx &= ~(F(RTM) | F(HLE));
+ *ebx &= ~(feature_bit(RTM) | feature_bit(HLE));
} else if (function == 0x80000007) {
if (kvm_hv_invtsc_suppressed(vcpu))
- *edx &= ~SF(CONSTANT_TSC);
+ *edx &= ~feature_bit(CONSTANT_TSC);
}
} else {
*eax = *ebx = *ecx = *edx = 0;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index f16a7b2c2adc..67d80aa72d50 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -10,8 +10,8 @@
extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
+void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
-void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
u32 function, u32 index);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
@@ -67,41 +67,40 @@ static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
*reg = kvm_cpu_caps[leaf];
}
-static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu,
- unsigned int x86_feature)
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
{
const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
struct kvm_cpuid_entry2 *entry;
+ u32 *reg;
+
+ /*
+ * XSAVES is a special snowflake. Due to lack of a dedicated intercept
+ * on SVM, KVM must assume that XSAVES (and thus XRSTORS) is usable by
+ * the guest if the host supports XSAVES and *XSAVE* is exposed to the
+ * guest. Because the guest can execute XSAVES and XRSTORS, i.e. can
+ * indirectly consume XSS, KVM must ensure XSS is zeroed when running
+ * the guest, i.e. must set XSAVES in vCPU capabilities. But to reject
+ * direct XSS reads and writes (to minimize the virtualization hole and
+ * honor userspace's CPUID), KVM needs to check the raw guest CPUID,
+ * not KVM's view of guest capabilities.
+ *
+ * For all other features, guest capabilities are accurate. Expand
+ * this allowlist with extreme vigilance.
+ */
+ BUILD_BUG_ON(x86_feature != X86_FEATURE_XSAVES);
entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
if (!entry)
return NULL;
- return __cpuid_entry_get_reg(entry, cpuid.reg);
-}
-
-static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu,
- unsigned int x86_feature)
-{
- u32 *reg;
-
- reg = guest_cpuid_get_register(vcpu, x86_feature);
+ reg = __cpuid_entry_get_reg(entry, cpuid.reg);
if (!reg)
return false;
return *reg & __feature_bit(x86_feature);
}
-static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
- unsigned int x86_feature)
-{
- u32 *reg;
-
- reg = guest_cpuid_get_register(vcpu, x86_feature);
- if (reg)
- *reg &= ~__feature_bit(x86_feature);
-}
-
static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
{
return vcpu->arch.is_amd_compatible;
@@ -150,21 +149,6 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
return x86_stepping(best->eax);
}
-static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
-{
- return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
- guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
- guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
-}
-
-static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
-{
- return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
- guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
- guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
-}
-
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
{
return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
@@ -180,7 +164,6 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
- reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
}
@@ -188,7 +171,6 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
- reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
}
@@ -196,7 +178,6 @@ static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
- reverse_cpuid_check(x86_leaf);
return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
}
@@ -220,58 +201,61 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
}
-enum kvm_governed_features {
-#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
-#include "governed_features.h"
- KVM_NR_GOVERNED_FEATURES
-};
-
-static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
+static __always_inline void guest_cpu_cap_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
{
- switch (x86_feature) {
-#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
-#include "governed_features.h"
- default:
- return -1;
- }
-}
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
-static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature)
-{
- return kvm_governed_feature_index(x86_feature) >= 0;
+ vcpu->arch.cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
}
-static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
- unsigned int x86_feature)
+static __always_inline void guest_cpu_cap_clear(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
{
- BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
- __set_bit(kvm_governed_feature_index(x86_feature),
- vcpu->arch.governed_features.enabled);
+ vcpu->arch.cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
}
-static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
- unsigned int x86_feature)
+static __always_inline void guest_cpu_cap_change(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature,
+ bool guest_has_cap)
{
- if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
- kvm_governed_feature_set(vcpu, x86_feature);
+ if (guest_has_cap)
+ guest_cpu_cap_set(vcpu, x86_feature);
+ else
+ guest_cpu_cap_clear(vcpu, x86_feature);
}
-static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
- unsigned int x86_feature)
+static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
{
- BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
- return test_bit(kvm_governed_feature_index(x86_feature),
- vcpu->arch.governed_features.enabled);
+ return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature);
}
static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- if (guest_can_use(vcpu, X86_FEATURE_LAM))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LAM))
cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
return kvm_vcpu_is_legal_gpa(vcpu, cr3);
}
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB));
+}
+
#endif
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
deleted file mode 100644
index ad463b1ed4e4..000000000000
--- a/arch/x86/kvm/governed_features.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
-BUILD_BUG()
-#endif
-
-#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
-
-KVM_GOVERNED_X86_FEATURE(GBPAGES)
-KVM_GOVERNED_X86_FEATURE(XSAVES)
-KVM_GOVERNED_X86_FEATURE(VMX)
-KVM_GOVERNED_X86_FEATURE(NRIPS)
-KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
-KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
-KVM_GOVERNED_X86_FEATURE(LBRV)
-KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
-KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
-KVM_GOVERNED_X86_FEATURE(VGIF)
-KVM_GOVERNED_X86_FEATURE(VNMI)
-KVM_GOVERNED_X86_FEATURE(LAM)
-
-#undef KVM_GOVERNED_X86_FEATURE
-#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 4f0a94346d00..6a6dd5a84f22 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1352,7 +1352,7 @@ static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
return;
if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) ||
- !guest_cpuid_has(vcpu, X86_FEATURE_XSAVEC))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC))
return;
pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. "
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index cd57a517d04a..d7ab8780ab9e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pid_nr = pid_vnr(pid);
put_pid(pid);
- pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+ pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr);
if (IS_ERR(pit->worker))
goto fail_kthread;
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 10495fffb890..73072585e164 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -88,6 +88,8 @@ struct x86_instruction_info {
#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
+/* Emulation during event vectoring is unhandleable. */
+#define X86EMUL_UNHANDLEABLE_VECTORING 7
/* x86-specific emulation flags */
#define X86EMUL_F_WRITE BIT(0)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3c83951c619e..a009c94c26c2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -598,7 +598,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
* version first and level-triggered interrupts never get EOIed in
* IOAPIC.
*/
- if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
!ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
@@ -734,10 +734,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
if (unlikely(apic->apicv_active)) {
- /* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- kvm_x86_call(hwapic_irr_update)(apic->vcpu,
- apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -763,7 +760,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(apic->apicv_active))
- kvm_x86_call(hwapic_isr_update)(vec);
+ kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -808,7 +805,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(apic->apicv_active))
- kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -816,6 +813,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
}
}
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
+ return;
+
+ kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
+
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
/* This may race with setting of irr in __apic_accept_irq() and
@@ -2357,7 +2365,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTT:
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
- val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+ val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask);
kvm_lapic_set_reg(apic, APIC_LVTT, val);
apic_update_lvtt(apic);
break;
@@ -2634,7 +2642,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
return 0;
u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
- (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+ (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
return 1;
@@ -2805,8 +2813,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic_update_ppr(apic);
if (apic->apicv_active) {
kvm_x86_call(apicv_post_state_restore)(vcpu);
- kvm_x86_call(hwapic_irr_update)(vcpu, -1);
- kvm_x86_call(hwapic_isr_update)(-1);
+ kvm_x86_call(hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -3121,9 +3128,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
if (apic->apicv_active) {
kvm_x86_call(apicv_post_state_restore)(vcpu);
- kvm_x86_call(hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
- kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 24add38beaf0..1a8553ebdb42 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -118,6 +118,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high);
int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated);
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9322358678b..050a0e229a4d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
+ /*
+ * Checking root.hpa is sufficient even when KVM has mirror root.
+ * We can have either:
+ * (1) mirror_root_hpa = INVALID_PAGE, root.hpa = INVALID_PAGE
+ * (2) mirror_root_hpa = root, root.hpa = INVALID_PAGE
+ * (3) mirror_root_hpa = root1, root.hpa = root2
+ * We don't ever have:
+ * mirror_root_hpa = INVALID_PAGE, root.hpa = root
+ */
if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
@@ -126,7 +135,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu)
{
- if (!guest_can_use(vcpu, X86_FEATURE_LAM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LAM))
return 0;
return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
@@ -287,4 +296,26 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
return gpa;
return translate_nested_gpa(vcpu, gpa, access, exception);
}
+
+static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static inline gfn_t kvm_gfn_direct_bits(const struct kvm *kvm)
+{
+ return kvm->arch.gfn_direct_bits;
+}
+
+static inline bool kvm_is_addr_direct(struct kvm *kvm, gpa_t gpa)
+{
+ gpa_t gpa_direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(kvm));
+
+ return !gpa_direct_bits || (gpa & gpa_direct_bits);
+}
+
+static inline bool kvm_is_gfn_alias(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn & kvm_gfn_direct_bits(kvm);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2401606db260..74c20dbb92da 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -599,6 +599,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
return r;
+ if (kvm_has_mirrored_tdp(vcpu->kvm)) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
PT64_ROOT_MAX_LEVEL);
if (r)
@@ -618,6 +624,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
@@ -3656,8 +3663,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
- if (tdp_mmu_enabled)
- return kvm_tdp_mmu_alloc_root(vcpu);
+ if (tdp_mmu_enabled) {
+ if (kvm_has_mirrored_tdp(vcpu->kvm) &&
+ !VALID_PAGE(mmu->mirror_root_hpa))
+ kvm_tdp_mmu_alloc_root(vcpu, true);
+ kvm_tdp_mmu_alloc_root(vcpu, false);
+ return 0;
+ }
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
@@ -4379,8 +4391,12 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, unsigned int access)
{
struct kvm_memory_slot *slot = fault->slot;
+ struct kvm *kvm = vcpu->kvm;
int ret;
+ if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm))
+ return -EFAULT;
+
/*
* Note that the mmu_invalidate_seq also serves to detect a concurrent
* change in attributes. is_page_fault_stale() will detect an
@@ -4394,7 +4410,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* Now that we have a snapshot of mmu_invalidate_seq we can check for a
* private vs. shared mismatch.
*/
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
@@ -4456,7 +4472,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_mmu_faultin_pfn(vcpu, fault);
@@ -4476,7 +4492,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* overall cost of failing to detect the invalidation until after
* mmu_lock is acquired.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
return RET_PF_RETRY;
}
@@ -5022,7 +5038,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
guest_cpuid_is_amd_compatible(vcpu));
}
@@ -5099,7 +5115,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
is_pse, is_amd);
if (!shadow_me_mask)
@@ -6095,8 +6111,16 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
else if (r == RET_PF_SPURIOUS)
vcpu->stat.pf_spurious++;
+ /*
+ * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or
+ * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE.
+ * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to
+ * indicate continuing the page fault handling until to the final
+ * page table mapping phase.
+ */
+ WARN_ON_ONCE(r == RET_PF_CONTINUE);
if (r != RET_PF_EMULATE)
- return 1;
+ return r;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
@@ -6272,6 +6296,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
+ mmu->mirror_root_hpa = INVALID_PAGE;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -6441,8 +6466,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (tdp_mmu_enabled)
- kvm_tdp_mmu_invalidate_all_roots(kvm);
+ if (tdp_mmu_enabled) {
+ /*
+ * External page tables don't support fast zapping, therefore
+ * their mirrors must be invalidated separately by the caller.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
+ }
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
@@ -6467,7 +6497,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* lead to use-after-free.
*/
if (tdp_mmu_enabled)
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
void kvm_mmu_init_vm(struct kvm *kvm)
@@ -7090,6 +7120,19 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be valid even though the VM is globally visible. Do nothing,
+ * as such a VM can't have any possible NX huge pages.
+ */
+ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
+
+ if (nx_thread)
+ vhost_task_wake(nx_thread);
+}
+
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
@@ -7150,7 +7193,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
}
mutex_unlock(&kvm_lock);
}
@@ -7220,6 +7263,12 @@ out:
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
+ if (tdp_mmu_enabled) {
+ read_lock(&vcpu->kvm->mmu_lock);
+ mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa,
+ NULL);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ }
free_mmu_pages(&vcpu->arch.root_mmu);
free_mmu_pages(&vcpu->arch.guest_mmu);
mmu_free_memory_caches(vcpu);
@@ -7279,7 +7328,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
mutex_unlock(&kvm_lock);
}
@@ -7411,20 +7460,34 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
return true;
}
+static void kvm_mmu_start_lpage_recovery(struct once *once)
+{
+ struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct vhost_task *nx_thread;
+
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
+ kvm_nx_huge_page_recovery_worker_kill,
+ kvm, "kvm-nx-lpage-recovery");
+
+ if (!nx_thread)
+ return;
+
+ vhost_task_start(nx_thread);
+
+ /* Make the task visible only once it is fully started. */
+ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+}
+
int kvm_mmu_post_init_vm(struct kvm *kvm)
{
if (nx_hugepage_mitigation_hard_disabled)
return 0;
- kvm->arch.nx_huge_page_last = get_jiffies_64();
- kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
- kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
- kvm, "kvm-nx-lpage-recovery");
-
+ call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
if (!kvm->arch.nx_huge_page_recovery_thread)
return -ENOMEM;
-
- vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
return 0;
}
@@ -7452,6 +7515,12 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
return kvm_unmap_gfn_range(kvm, range);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index b00abbe3f6cf..75f00598289d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
+#include "mmu.h"
+
#ifdef CONFIG_KVM_PROVE_MMU
#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
@@ -101,7 +103,22 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
- unsigned int unsync_children;
+ union {
+ /* These two members aren't used for TDP MMU */
+ struct {
+ unsigned int unsync_children;
+ /*
+ * Number of writes since the last time traversal
+ * visited this page.
+ */
+ atomic_t write_flooding_count;
+ };
+ /*
+ * Page table page of external PT.
+ * Passed to TDX module, not accessed by KVM.
+ */
+ void *external_spt;
+ };
union {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
@@ -124,9 +141,6 @@ struct kvm_mmu_page {
int clear_spte_count;
#endif
- /* Number of writes since the last time traversal visited this page. */
- atomic_t write_flooding_count;
-
#ifdef CONFIG_X86_64
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
@@ -145,6 +159,34 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return kvm_mmu_role_as_id(sp->role);
}
+static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
+{
+ return sp->role.is_mirror;
+}
+
+static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ /*
+ * external_spt is allocated for TDX module to hold private EPT mappings,
+ * TDX module will initialize the page by itself.
+ * Therefore, KVM does not need to initialize or access external_spt.
+ * KVM only interacts with sp->spt for private EPT operations.
+ */
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+}
+
+static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
+{
+ /*
+ * Since mirror SPs are used only for TDX, which maps private memory
+ * at its "natural" GFN, no mask needs to be applied to them - and, dually,
+ * we expect that the bits is only used for the shared PT.
+ */
+ if (is_mirror_sp(root))
+ return 0;
+ return kvm_gfn_direct_bits(kvm);
+}
+
static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
/*
@@ -229,7 +271,12 @@ struct kvm_page_fault {
*/
u8 goal_level;
- /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+ * the case of VM where memslot's can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
gfn_t gfn;
/* The memslot containing gfn. May be NULL. */
@@ -268,9 +315,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
*
* Note, all values must be greater than or equal to zero so as not to encroach
- * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
- * will allow for efficient machine code when checking for CONTINUE, e.g.
- * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
+ * on -errno return values.
*/
enum {
RET_PF_CONTINUE = 0,
@@ -282,6 +327,14 @@ enum {
RET_PF_SPURIOUS,
};
+/*
+ * Define RET_PF_CONTINUE as 0 to allow for
+ * - efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
+ * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
+ */
+static_assert(RET_PF_CONTINUE == 0);
+
static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -317,10 +370,19 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int r;
if (vcpu->arch.mmu->root_role.direct) {
- fault.gfn = fault.addr >> PAGE_SHIFT;
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
+ /*
+ * With retpoline being active an indirect call is rather expensive,
+ * so do a direct call in the most common case.
+ */
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index af10bc0380a3..59746854c0af 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -276,6 +276,11 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
+static inline bool is_mirror_sptep(tdp_ptep_t sptep)
+{
+ return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 04c247bfe318..9e17bfa80901 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
- SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+ SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
@@ -37,15 +37,17 @@ void tdp_iter_restart(struct tdp_iter *iter)
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn)
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
{
if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
- (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ (root->role.level > PT64_ROOT_MAX_LEVEL) ||
+ (gfn_bits && next_last_level_gfn >= gfn_bits))) {
iter->valid = false;
return;
}
iter->next_last_level_gfn = next_last_level_gfn;
+ iter->gfn_bits = gfn_bits;
iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
@@ -113,7 +115,7 @@ static bool try_step_side(struct tdp_iter *iter)
* Check if the iterator is already at the end of the current page
* table.
*/
- if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
+ if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
(SPTE_ENT_PER_PAGE - 1))
return false;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 2880fd392e0c..047b78333653 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -93,8 +93,10 @@ struct tdp_iter {
tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
tdp_ptep_t sptep;
- /* The lowest GFN mapped by the current SPTE */
+ /* The lowest GFN (mask bits excluded) mapped by the current SPTE */
gfn_t gfn;
+ /* Mask applied to convert the GFN to the mapping GPA */
+ gfn_t gfn_bits;
/* The level of the root page given to the iterator */
int root_level;
/* The lowest level the iterator should traverse to */
@@ -122,18 +124,23 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
- for (tdp_iter_start(&iter, root, min_level, start); \
- iter.valid && iter.gfn < end; \
+#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
+ iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, start, end) \
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte_min_level_all(iter, root, min_level) \
+ for (tdp_iter_start(&iter, root, min_level, 0, 0); \
+ iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, kvm, root, start, end) \
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn);
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2f15e0e33903..046b6ba31197 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -37,8 +37,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
* for zapping and thus puts the TDP MMU's reference to each root, i.e.
* ultimately frees all roots.
*/
- kvm_tdp_mmu_invalidate_all_roots(kvm);
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
@@ -53,6 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
+ free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -91,19 +92,33 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
+ return false;
+
+ if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
+ return false;
+
+ if (likely(!is_mirror_sp(root)))
+ return types & KVM_DIRECT_ROOTS;
+ return types & KVM_MIRROR_ROOTS;
+}
+
/*
* Returns the next root after @prev_root (or the first root if @prev_root is
- * NULL). A reference to the returned root is acquired, and the reference to
- * @prev_root is released (the caller obviously must hold a reference to
- * @prev_root if it's non-NULL).
+ * NULL) that matches with @types. A reference to the returned root is
+ * acquired, and the reference to @prev_root is released (the caller obviously
+ * must hold a reference to @prev_root if it's non-NULL).
*
- * If @only_valid is true, invalid roots are skipped.
+ * Roots that doesn't match with @types are skipped.
*
* Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool only_valid)
+ enum kvm_tdp_mmu_root_types types)
{
struct kvm_mmu_page *next_root;
@@ -124,7 +139,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
typeof(*next_root), link);
while (next_root) {
- if ((!only_valid || !next_root->role.invalid) &&
+ if (tdp_mmu_root_match(next_root, types) &&
kvm_tdp_mmu_get_root(next_root))
break;
@@ -149,20 +164,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ _root = tdp_mmu_next_root(_kvm, _root, _types)) \
if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, false))
+ _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -171,18 +186,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
- ((_only_valid) && (_root)->role.invalid))) { \
+ !tdp_mmu_root_match((_root), (_types)))) { \
} else
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
-
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
@@ -223,7 +235,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role role = mmu->root_role;
@@ -231,6 +243,9 @@ int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
+ if (mirror)
+ role.is_mirror = true;
+
/*
* Check for an existing root before acquiring the pages lock to avoid
* unnecessary serialization if multiple vCPUs are loading a new root.
@@ -282,9 +297,12 @@ out_read_unlock:
* and actually consuming the root if it's invalidated after dropping
* mmu_lock, and the root can't be freed as this vCPU holds a reference.
*/
- mmu->root.hpa = __pa(root->spt);
- mmu->root.pgd = 0;
- return 0;
+ if (mirror) {
+ mmu->mirror_root_hpa = __pa(root->spt);
+ } else {
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ }
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -322,6 +340,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
+static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ int level)
+{
+ kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+ int ret;
+
+ /*
+ * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
+ * PTs are removed in a special order, involving free_external_spt().
+ * But remove_external_spte() will be called on non-leaf PTEs via
+ * __tdp_mmu_zap_root(), so avoid the error the former would return
+ * in this case.
+ */
+ if (!is_last_spte(old_spte, level))
+ return;
+
+ /* Zapping leaf spte is allowed only when write lock is held. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ /* Because write lock is held, operation should success. */
+ ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
+ KVM_BUG_ON(ret, kvm);
+}
+
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -417,11 +458,81 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_spte, FROZEN_SPTE, level, shared);
+
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
+ /*
+ * Failed to free page table page in mirror page table and
+ * there is nothing to do further.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->external_spt = NULL;
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
+{
+ if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ WARN_ON_ONCE(sp->role.level + 1 != level);
+ WARN_ON_ONCE(sp->gfn != gfn);
+ return sp->external_spt;
+ }
+
+ return NULL;
+}
+
+static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
+ gfn_t gfn, u64 old_spte,
+ u64 new_spte, int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
+ int ret = 0;
+
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ /*
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
+ */
+ if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
+ return -EBUSY;
+
+ /*
+ * Use different call to either set up middle level
+ * external page table, or leaf.
+ */
+ if (is_leaf) {
+ ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
+ } else {
+ void *external_spt = get_external_spt(gfn, new_spte, level);
+
+ KVM_BUG_ON(!external_spt, kvm);
+ ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
+ }
+ if (ret)
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return ret;
+}
+
/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
@@ -522,11 +633,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
-static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
u64 new_spte)
{
- u64 *sptep = rcu_dereference(iter->sptep);
-
/*
* The caller is responsible for ensuring the old SPTE is not a FROZEN
* SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
@@ -535,15 +645,34 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
*/
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
- /*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
- * does not hold the mmu_lock. On failure, i.e. if a different logical
- * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
- * the current value, so the caller operates on fresh data, e.g. if it
- * retries tdp_mmu_set_spte_atomic()
- */
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
- return -EBUSY;
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ int ret;
+
+ /*
+ * Users of atomic zapping don't operate on mirror roots,
+ * so don't handle it and bug the VM if it's seen.
+ */
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EBUSY;
+
+ ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ if (ret)
+ return ret;
+ } else {
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
+ * and does not hold the mmu_lock. On failure, i.e. if a
+ * different logical CPU modified the SPTE, try_cmpxchg64()
+ * updates iter->old_spte with the current value, so the caller
+ * operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic()
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+ }
return 0;
}
@@ -573,7 +702,7 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
if (ret)
return ret;
@@ -613,6 +742,16 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+
+ /*
+ * Users that do non-atomic setting of PTEs don't operate on mirror
+ * roots, so don't handle it and bug the VM if it's seen.
+ */
+ if (is_mirror_sptep(sptep)) {
+ KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+
return old_spte;
}
@@ -625,18 +764,18 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
iter->gfn, iter->level);
}
-#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root, _start, _end)
+#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
-#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
- tdp_root_for_each_pte(_iter, _root, _start, _end) \
+#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
if (!is_shadow_present_pte(_iter.old_spte) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
continue; \
else
-#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
+#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
@@ -705,10 +844,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- gfn_t end = tdp_mmu_max_gfn_exclusive();
- gfn_t start = 0;
-
- for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+ for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -812,7 +948,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
@@ -863,19 +999,21 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *root;
/*
- * Zap all roots, including invalid roots, as all SPTEs must be dropped
- * before returning to the caller. Zap directly even if the root is
- * also being zapped by a worker. Walking zapped top-level SPTEs isn't
- * all that expensive and mmu_lock is already held, which means the
- * worker has yielded, i.e. flushing the work instead of zapping here
- * isn't guaranteed to be any faster.
+ * Zap all direct roots, including invalid direct roots, as all direct
+ * SPTEs must be dropped before returning to the caller. For TDX, mirror
+ * roots don't need handling in response to the mmu notifier (the caller).
+ *
+ * Zap directly even if the root is also being zapped by a concurrent
+ * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
+ * and mmu_lock is already held, which means the other thread has yielded.
*
* A TLB flush is unnecessary, KVM zaps everything if and only the VM
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
+ KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
tdp_mmu_zap_root(kvm, root, false);
}
@@ -883,11 +1021,14 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes.
*/
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
struct kvm_mmu_page *root;
- read_lock(&kvm->mmu_lock);
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
@@ -905,7 +1046,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* that may be zapped, as such entries are associated with the
* ASID on both VMX and SVM.
*/
- tdp_mmu_zap_root(kvm, root, true);
+ tdp_mmu_zap_root(kvm, root, shared);
/*
* The referenced needs to be put *after* zapping the root, as
@@ -915,7 +1056,10 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
kvm_tdp_mmu_put_root(kvm, root);
}
- read_unlock(&kvm->mmu_lock);
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
}
/*
@@ -928,11 +1072,19 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
* See kvm_tdp_mmu_alloc_root().
*/
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types)
{
struct kvm_mmu_page *root;
/*
+ * Invalidating invalid roots doesn't make sense, prevent developers from
+ * having to think about it.
+ */
+ if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
+ root_types &= ~KVM_INVALID_ROOTS;
+
+ /*
* mmu_lock must be held for write to ensure that a root doesn't become
* invalid while there are active readers (invalidating a root while
* there are active readers may or may not be problematic in practice,
@@ -953,6 +1105,9 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
* or get/put references to roots.
*/
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!tdp_mmu_root_match(root, root_types))
+ continue;
+
/*
* Note, invalid roots can outlive a memslot update! Invalid
* roots must be *zapped* before the memslot update completes,
@@ -1068,7 +1223,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
*/
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
@@ -1080,7 +1235,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1107,13 +1262,18 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
+ if (is_mirror_sp(sp))
+ kvm_mmu_alloc_external_spt(vcpu, sp);
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- if (is_shadow_present_pte(iter.old_spte))
+ if (is_shadow_present_pte(iter.old_spte)) {
+ /* Don't support large page for mirrored roots (TDX) */
+ KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
- else
+ } else {
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ }
/*
* Force the guest to retry if installing an upper level SPTE
@@ -1148,12 +1308,16 @@ retry:
return ret;
}
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
- __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
@@ -1193,20 +1357,24 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
bool test_only)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
+
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code. Note,
* this helper must NOT be used to unmap GFNs, as it processes only
* valid roots!
*/
- for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ WARN_ON(types & ~KVM_VALID_ROOTS);
+ __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
guard(rcu)();
- tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@@ -1247,7 +1415,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1366,7 +1534,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
* level above the target level (e.g. splitting a 1GB to 512 2MB pages,
* and then splitting each of those to 512 4KB pages).
*/
- for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -1464,7 +1632,7 @@ static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1512,7 +1680,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
+ tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
break;
@@ -1566,7 +1734,7 @@ static int tdp_mmu_make_huge_spte(struct kvm *kvm,
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
struct tdp_iter iter;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
/*
* Use the parent iterator when checking for forward progress so
* that KVM doesn't get stuck continuously trying to yield (i.e.
@@ -1600,7 +1768,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
@@ -1681,7 +1849,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -1729,14 +1897,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
*root_level = vcpu->arch.mmu->root_role.level;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1758,11 +1926,12 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
+ /* Fast pf is not supported for mirrored roots */
+ struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
tdp_ptep_t sptep = NULL;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index f03ca0dd13d9..52acf99d40a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
@@ -19,11 +19,56 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+enum kvm_tdp_mmu_root_types {
+ KVM_INVALID_ROOTS = BIT(0),
+ KVM_DIRECT_ROOTS = BIT(1),
+ KVM_MIRROR_ROOTS = BIT(2),
+ KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS,
+ KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS,
+};
+
+static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm,
+ enum kvm_gfn_range_filter process)
+{
+ enum kvm_tdp_mmu_root_types ret = 0;
+
+ if (!kvm_has_mirrored_tdp(kvm))
+ return KVM_DIRECT_ROOTS;
+
+ if (process & KVM_FILTER_PRIVATE)
+ ret |= KVM_MIRROR_ROOTS;
+ if (process & KVM_FILTER_SHARED)
+ ret |= KVM_DIRECT_ROOTS;
+
+ WARN_ON_ONCE(!ret);
+
+ return ret;
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
+ enum kvm_tdp_mmu_root_types type)
+{
+ if (unlikely(type == KVM_MIRROR_ROOTS))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 47a46283c866..75e9cfc689f8 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -797,7 +797,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
memset(pmu, 0, sizeof(*pmu));
kvm_pmu_call(init)(vcpu);
- kvm_pmu_refresh(vcpu);
}
/* Release perf_events for vPMCs that have been unused for a full time slice. */
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index e46220ece83c..fde0ae986003 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -7,23 +7,6 @@
#include <asm/cpufeatures.h>
/*
- * Hardware-defined CPUID leafs that are either scattered by the kernel or are
- * unknown to the kernel, but need to be directly used by KVM. Note, these
- * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
- */
-enum kvm_only_cpuid_leafs {
- CPUID_12_EAX = NCAPINTS,
- CPUID_7_1_EDX,
- CPUID_8000_0007_EDX,
- CPUID_8000_0022_EAX,
- CPUID_7_2_EDX,
- CPUID_24_0_EBX,
- NR_KVM_CPU_CAPS,
-
- NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
-};
-
-/*
* Define a KVM-only feature flag.
*
* For features that are scattered by cpufeatures.h, __feature_translate() also
@@ -145,7 +128,10 @@ static __always_inline u32 __feature_translate(int x86_feature)
static __always_inline u32 __feature_leaf(int x86_feature)
{
- return __feature_translate(x86_feature) / 32;
+ u32 x86_leaf = __feature_translate(x86_feature) / 32;
+
+ reverse_cpuid_check(x86_leaf);
+ return x86_leaf;
}
/*
@@ -168,7 +154,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_featu
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
- reverse_cpuid_check(x86_leaf);
return reverse_cpuid[x86_leaf];
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 85241c0c7f56..e0ab7df27b66 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -283,7 +283,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
memset(smram.bytes, 0, sizeof(smram.bytes));
#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, &smram.smram64);
else
#endif
@@ -353,7 +353,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
if (kvm_x86_call(set_efer)(vcpu, 0))
goto error;
#endif
@@ -586,7 +586,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
* supports long mode.
*/
#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
struct kvm_segment cs_desc;
unsigned long cr4;
@@ -609,7 +609,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
unsigned long cr4, efer;
/* Clear CR4.PAE before clearing EFER.LME. */
@@ -634,7 +634,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
ret = rsm_load_state_64(ctxt, &smram.smram64);
else
#endif
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index b708bdf7eaff..d77b094d9a4d 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -111,7 +111,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
{
- if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
+ if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
return true;
if (!nested_npt_enabled(svm))
@@ -594,7 +594,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
vmcb_mark_dirty(vmcb02, VMCB_DR);
}
- if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
/*
* Reserved bits of DEBUGCTL are ignored. Be consistent with
@@ -651,7 +651,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
*/
- if (guest_can_use(vcpu, X86_FEATURE_VGIF) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
(svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
else
@@ -689,7 +689,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
- if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
nested_svm_update_tsc_ratio_msr(vcpu);
@@ -710,7 +710,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* what a nrips=0 CPU would do (L1 is responsible for advancing RIP
* prior to injecting the event).
*/
- if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
else if (boot_cpu_has(X86_FEATURE_NRIPS))
vmcb02->control.next_rip = vmcb12_rip;
@@ -720,7 +720,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->soft_int_injected = true;
svm->soft_int_csbase = vmcb12_csbase;
svm->soft_int_old_rip = vmcb12_rip;
- if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
svm->soft_int_next_rip = svm->nested.ctl.next_rip;
else
svm->soft_int_next_rip = vmcb12_rip;
@@ -728,18 +728,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.virt_ext = vmcb01->control.virt_ext &
LBR_CTL_ENABLE_MASK;
- if (guest_can_use(vcpu, X86_FEATURE_LBRV))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV))
vmcb02->control.virt_ext |=
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
pause_count12 = svm->nested.ctl.pause_filter_count;
else
pause_count12 = 0;
- if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
else
pause_thresh12 = 0;
@@ -1026,7 +1026,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
nested_save_pending_event_to_vmcb12(svm, vmcb12);
- if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
@@ -1065,7 +1065,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_exit_on_intr(svm))
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 22d5a65b410c..288f7f2a46f2 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -46,7 +46,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
switch (msr) {
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE))
return NULL;
/*
* Each PMU counter has a pair of CTL and CTR MSRs. CTLn
@@ -109,7 +109,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
return pmu->version > 0;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
- return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE);
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE);
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
@@ -179,7 +179,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
union cpuid_0x80000022_ebx ebx;
pmu->version = 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) {
pmu->version = 2;
/*
* Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
@@ -189,7 +189,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
- } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
} else {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 943bd074a5d3..a2a794c32050 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3051,11 +3051,11 @@ out:
min_sev_asid, max_sev_asid);
if (boot_cpu_has(X86_FEATURE_SEV_ES))
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
- sev_es_supported ? "enabled" : "disabled",
+ str_enabled_disabled(sev_es_supported),
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
if (boot_cpu_has(X86_FEATURE_SEV_SNP))
pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
- sev_snp_supported ? "enabled" : "disabled",
+ str_enabled_disabled(sev_snp_supported),
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
sev_enabled = sev_supported;
@@ -3627,13 +3627,20 @@ static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
return 1; /* resume guest */
}
- if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
return 1; /* resume guest */
}
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = 1;
vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
@@ -3710,7 +3717,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
bool huge;
u64 gfn;
- if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
return 1;
}
@@ -3797,6 +3804,13 @@ next_range:
case VMGEXIT_PSC_OP_SHARED:
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
@@ -3820,7 +3834,7 @@ next_range:
goto next_range;
}
- unreachable();
+ BUG();
}
static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
@@ -4435,8 +4449,8 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
}
@@ -4445,16 +4459,15 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
* the host/guest supports its use.
*
- * guest_can_use() checks a number of requirements on the host/guest to
- * ensure that MSR_IA32_XSS is available, but it might report true even
- * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host
- * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better
- * to further check that the guest CPUID actually supports
- * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved
- * guests will still get intercepted and caught in the normal
- * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths.
+ * KVM treats the guest as being capable of using XSAVES even if XSAVES
+ * isn't enabled in guest CPUID as there is no intercept for XSAVES,
+ * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
+ * exposed to the guest and XSAVES is supported in hardware. Condition
+ * full XSS passthrough on the guest being able to use XSAVES *and*
+ * XSAVES being exposed to the guest so that KVM can at least honor
+ * guest CPUID for RDMSR and WRMSR.
*/
- if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
else
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 21dacd312779..7640a84e554a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -28,6 +28,7 @@
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>
+#include <linux/string_choices.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -284,8 +285,6 @@ u32 svm_msrpm_offset(u32 msr)
return MSR_INVALID;
}
-static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -1049,7 +1048,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
- (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
if (enable_lbrv == current_enable_lbrv)
@@ -1187,14 +1186,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
*/
if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
if (!npt_enabled ||
- !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
+ !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
svm_set_intercept(svm, INTERCEPT_INVPCID);
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
}
if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
- if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
svm_clr_intercept(svm, INTERCEPT_RDTSCP);
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
@@ -1921,9 +1920,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = vcpu->arch.cr4;
- if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb_current(vcpu);
-
vcpu->arch.cr4 = cr4;
if (!npt_enabled) {
cr4 |= X86_CR4_PAE;
@@ -2864,7 +2860,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
if (!msr_info->host_initiated &&
- !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
return 1;
msr_info->data = svm->tsc_ratio_msr;
break;
@@ -2940,7 +2936,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
return 1;
msr_info->data = svm->virt_spec_ctrl;
@@ -3024,7 +3020,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {
if (!msr->host_initiated)
return 1;
@@ -3046,7 +3042,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_ratio_msr = data;
- if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
is_guest_mode(vcpu))
nested_svm_update_tsc_ratio_msr(vcpu);
@@ -3091,7 +3087,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
return 1;
if (data & ~SPEC_CTRL_SSBD)
@@ -3263,7 +3259,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
unsigned long type;
gva_t gva;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -3533,6 +3529,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
*error_code = 0;
}
+static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
+ u32 *error_code)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *intr_info = control->event_inj;
+
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->event_inj_err;
+ else
+ *error_code = 0;
+
+}
+
static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4392,27 +4403,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
* the guest read/write access to the host's XSS.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVES) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
- kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
+ guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));
/*
* Intercept VMLOAD if the vCPU model is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
- if (!guest_cpuid_is_intel_compatible(vcpu))
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
svm_recalc_instruction_intercepts(vcpu, svm);
@@ -4422,7 +4423,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
- !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+ !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
if (sev_guest(vcpu->kvm))
sev_vcpu_after_set_cpuid(svm);
@@ -4673,7 +4674,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
* responsible for ensuring nested SVM and SMIs are mutually exclusive.
*/
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return 1;
smram->smram64.svm_guest_flag = 1;
@@ -4720,14 +4721,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
const struct kvm_smram_state_64 *smram64 = &smram->smram64;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return 0;
/* Non-zero if SMI arrived while vCPU was in guest mode. */
if (!smram64->svm_guest_flag)
return 0;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
return 1;
if (!(smram64->efer & EFER_SVME))
@@ -4790,9 +4791,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
bool smep, smap, is_user;
u64 error_code;
+ /* Check that emulation is possible during event vectoring */
+ if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
/* Emulation is always possible when KVM has access to all guest state. */
if (!sev_guest(vcpu->kvm))
return X86EMUL_CONTINUE;
@@ -4889,7 +4896,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* In addition, don't apply the erratum workaround if the #NPF occurred
* while translating guest page tables (see below).
*/
- error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ error_code = svm->vmcb->control.exit_info_1;
if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
goto resume_guest;
@@ -5077,6 +5084,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
.get_exit_info = svm_get_exit_info,
+ .get_entry_info = svm_get_entry_info,
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
@@ -5328,7 +5336,7 @@ static __init int svm_hardware_setup(void)
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 43fa6a16eb19..9d7cdb8fbf87 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -358,39 +358,32 @@ static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
return &to_kvm_svm(kvm)->sev_info;
}
+#ifdef CONFIG_KVM_AMD_SEV
static __always_inline bool sev_guest(struct kvm *kvm)
{
-#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->active;
-#else
- return false;
-#endif
}
-
static __always_inline bool sev_es_guest(struct kvm *kvm)
{
-#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->es_active && !WARN_ON_ONCE(!sev->active);
-#else
- return false;
-#endif
}
static __always_inline bool sev_snp_guest(struct kvm *kvm)
{
-#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
!WARN_ON_ONCE(!sev_es_guest(kvm));
+}
#else
- return false;
+#define sev_guest(kvm) false
+#define sev_es_guest(kvm) false
+#define sev_snp_guest(kvm) false
#endif
-}
static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val)
{
@@ -502,7 +495,7 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
- return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
+ return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) &&
(svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
}
@@ -554,7 +547,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
{
- return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) &&
+ return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) &&
(svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
}
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index d3aeffd6ae75..0b844cb97978 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -22,15 +22,22 @@ TRACE_EVENT(kvm_entry,
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
__field( bool, immediate_exit )
+ __field( u32, intr_info )
+ __field( u32, error_code )
),
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
__entry->immediate_exit = force_immediate_exit;
+
+ kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info,
+ &__entry->error_code);
),
- TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+ TP_printk("vcpu %u, rip 0x%lx intr_info 0x%08x error_code 0x%08x%s",
+ __entry->vcpu_id, __entry->rip,
+ __entry->intr_info, __entry->error_code,
__entry->immediate_exit ? "[immediate exit]" : "")
);
@@ -308,12 +315,14 @@ TRACE_EVENT(name, \
__field( u32, intr_info ) \
__field( u32, error_code ) \
__field( unsigned int, vcpu_id ) \
+ __field( u64, requests ) \
), \
\
TP_fast_assign( \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
+ __entry->requests = READ_ONCE(vcpu->requests); \
kvm_x86_call(get_exit_info)(vcpu, \
&__entry->exit_reason, \
&__entry->info1, \
@@ -323,11 +332,13 @@ TRACE_EVENT(name, \
), \
\
TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
- "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \
+ "info2 0x%016llx intr_info 0x%08x error_code 0x%08x " \
+ "requests 0x%016llx", \
__entry->vcpu_id, \
kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
__entry->guest_rip, __entry->info1, __entry->info2, \
- __entry->intr_info, __entry->error_code) \
+ __entry->intr_info, __entry->error_code, \
+ __entry->requests) \
)
/*
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index a87407412615..11a339009781 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -42,7 +42,7 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
return vmx->nested.hv_evmcs;
}
-static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
+static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu)
{
/*
* eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
index a543fccfc574..6536290f4274 100644
--- a/arch/x86/kvm/vmx/hyperv_evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -6,7 +6,7 @@
#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H
#define __KVM_X86_VMX_HYPERV_EVMCS_H
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include "capabilities.h"
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 92d35cc6cd15..2427f918e763 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -100,7 +100,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_pre_state_restore = vmx_apicv_pre_state_restore,
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_interrupt = vmx_deliver_interrupt,
@@ -111,6 +110,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
+ .get_entry_info = vmx_get_entry_info,
.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
@@ -126,7 +126,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .cpu_dirty_log_size = PML_ENTITY_NUM,
+ .cpu_dirty_log_size = PML_LOG_NR_ENTRIES,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
.nested_ops = &vmx_nested_ops,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index aa78b6f38dfe..8a7af02d466e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -257,7 +257,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
* state. It is possible that the area will stay mapped as
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
- if (!guest_cpuid_has_evmcs(vcpu) ||
+ if (!guest_cpu_cap_has_evmcs(vcpu) ||
!evmptr_is_valid(nested_get_evmptr(vcpu)))
return false;
@@ -2089,7 +2089,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
- if (likely(!guest_cpuid_has_evmcs(vcpu)))
+ if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
evmcs_gpa = nested_get_evmptr(vcpu);
@@ -2992,7 +2992,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return -EINVAL;
#ifdef CONFIG_KVM_HYPERV
- if (guest_cpuid_has_evmcs(vcpu))
+ if (guest_cpu_cap_has_evmcs(vcpu))
return nested_evmcs_check_controls(vmcs12);
#endif
@@ -3287,7 +3287,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (guest_cpuid_has_evmcs(vcpu) &&
+ if (guest_cpu_cap_has_evmcs(vcpu) &&
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
@@ -3442,7 +3442,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
if (!nested_cpu_has_pml(vmcs12))
return 0;
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
vmx->nested.pml_full = true;
return 1;
}
@@ -3481,14 +3481,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
-static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
-{
- u8 rvi = vmx_get_rvi();
- u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
@@ -3508,7 +3500,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
enum vm_entry_failure_code entry_failure_code;
- bool evaluate_pending_interrupts;
union vmx_exit_reason exit_reason = {
.basic = EXIT_REASON_INVALID_STATE,
.failed_vmentry = 1,
@@ -3527,13 +3518,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
kvm_service_local_tlb_flush_requests(vcpu);
- evaluate_pending_interrupts = exec_controls_get(vmx) &
- (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
- if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
- evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
- if (!evaluate_pending_interrupts)
- evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
-
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
@@ -3616,9 +3600,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
* Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
* when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
* effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
- * unconditionally.
+ * unconditionally. Take care to pull data from vmcs01 as appropriate,
+ * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
*/
- if (unlikely(evaluate_pending_interrupts))
+ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING)) ||
+ kvm_apic_has_pending_init_or_sipi(vcpu) ||
+ kvm_apic_has_interrupt(vcpu))
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
@@ -3751,14 +3739,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
goto vmentry_failed;
- /* Emulate processing of posted interrupts on VM-Enter. */
- if (nested_cpu_has_posted_intr(vmcs12) &&
- kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
- vmx->nested.pi_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
- }
-
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -4220,13 +4200,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
*/
bool block_nested_exceptions = vmx->nested.nested_run_pending;
/*
- * New events (not exceptions) are only recognized at instruction
+ * Events that don't require injection, i.e. that are virtualized by
+ * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
+ * to regain control in order to deliver the event, and hardware will
+ * handle event ordering, e.g. with respect to injected exceptions.
+ *
+ * But, new events (not exceptions) are only recognized at instruction
* boundaries. If an event needs reinjection, then KVM is handling a
- * VM-Exit that occurred _during_ instruction execution; new events are
- * blocked until the instruction completes.
+ * VM-Exit that occurred _during_ instruction execution; new events,
+ * irrespective of whether or not they're injected, are blocked until
+ * the instruction completes.
+ */
+ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
+ /*
+ * Inject events are blocked by nested VM-Enter, as KVM is responsible
+ * for managing priority between concurrent events, i.e. KVM needs to
+ * wait until after VM-Enter completes to deliver injected events.
*/
bool block_nested_events = block_nested_exceptions ||
- kvm_event_needs_reinjection(vcpu);
+ block_non_injected_events;
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
@@ -4338,18 +4330,26 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
int irq;
- if (block_nested_events)
- return -EBUSY;
- if (!nested_exit_on_intr(vcpu))
+ if (!nested_exit_on_intr(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
goto no_vmexit;
+ }
if (!nested_exit_intr_ack_set(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
return 0;
}
irq = kvm_cpu_get_extint(vcpu);
if (irq != -1) {
+ if (block_nested_events)
+ return -EBUSY;
+
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
return 0;
@@ -4368,11 +4368,22 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
* and enabling posted interrupts requires ACK-on-exit.
*/
if (irq == vmx->nested.posted_intr_nv) {
+ /*
+ * Nested posted interrupts are delivered via RVI, i.e.
+ * aren't injected by KVM, and so can be queued even if
+ * manual event injection is disallowed.
+ */
+ if (block_non_injected_events)
+ return -EBUSY;
+
vmx->nested.pi_pending = true;
kvm_apic_clear_irr(vcpu, irq);
goto no_vmexit;
}
+ if (block_nested_events)
+ return -EBUSY;
+
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
@@ -5015,7 +5026,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
* doesn't isolate different VMCSs, i.e. in this case, doesn't provide
* separate modes for L2 vs L1.
*/
- if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL))
indirect_branch_prediction_barrier();
/* Update any VMCS fields that might have changed while L2 ran */
@@ -5050,6 +5061,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
+ if (vmx->nested.update_vmcs01_hwapic_isr) {
+ vmx->nested.update_vmcs01_hwapic_isr = false;
+ kvm_apic_update_hwapic_isr(vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
@@ -6279,7 +6295,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
{
u32 encls_leaf;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
return false;
@@ -6617,7 +6633,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
- if (guest_can_use(vcpu, X86_FEATURE_VMX) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
@@ -6758,7 +6774,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
return -EINVAL;
} else {
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return -EINVAL;
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
@@ -6792,7 +6808,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
- (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
!vmx->nested.enlightened_vmcs_enabled))
return -EINVAL;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9c9d4a336166..77012b2eca0e 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -110,7 +110,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return 0;
return vcpu->arch.perf_capabilities;
@@ -160,7 +160,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
break;
case MSR_IA32_DS_AREA:
- ret = guest_cpuid_has(vcpu, X86_FEATURE_DS);
+ ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS);
break;
case MSR_PEBS_DATA_CFG:
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index b352a3ba7354..9961e07cf071 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -122,7 +122,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
* likely than a bad userspace address.
*/
if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
- guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) {
memset(&ex, 0, sizeof(ex));
ex.vector = PF_VECTOR;
ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
@@ -365,7 +365,7 @@ static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
return true;
if (leaf >= EAUG && leaf <= EMODT)
- return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2);
return false;
}
@@ -381,8 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
{
u32 leaf = (u32)kvm_rax_read(vcpu);
- if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
kvm_queue_exception(vcpu, UD_VECTOR);
} else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
!sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
@@ -479,15 +479,15 @@ void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (!cpu_has_vmx_encls_vmexit())
return;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) &&
sgx_enabled_in_guest_bios(vcpu)) {
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
if (sgx_intercept_encls_ecreate(vcpu))
bitmap |= (1 << ECREATE);
}
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
bitmap &= ~GENMASK_ULL(EMODT, EAUG);
/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 893366e53732..f72835e85b6d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1636,7 +1636,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
return 1;
/*
@@ -1705,6 +1706,12 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
kvm_queue_exception(vcpu, UD_VECTOR);
return X86EMUL_PROPAGATE_FAULT;
}
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
return X86EMUL_CONTINUE;
}
@@ -1908,8 +1915,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
/*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
@@ -2062,7 +2069,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -2078,13 +2085,13 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
@@ -2097,7 +2104,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* sanity checking and refuse to boot. Filter all unsupported
* features out.
*/
- if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
#endif
@@ -2167,7 +2174,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
u64 data)
{
#ifdef CONFIG_X86_64
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return (u32)data;
#endif
return (unsigned long)data;
@@ -2178,7 +2185,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
u64 debugctl = 0;
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
@@ -2282,7 +2289,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
@@ -2384,7 +2391,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* behavior, but it's close enough.
*/
if (!msr_info->host_initiated &&
- (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
return 1;
@@ -2394,7 +2401,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
@@ -2468,9 +2475,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data & PERF_CAP_PEBS_MASK) !=
(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
@@ -4590,10 +4597,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
bool __enabled; \
\
if (cpu_has_vmx_##name()) { \
- if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
- __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
- else \
- __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
__enabled, exiting); \
} \
@@ -4669,8 +4673,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
@@ -4820,7 +4824,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
vmx_write_encls_bitmap(&vmx->vcpu, NULL);
@@ -5959,7 +5963,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
} operand;
int gpr_index;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6049,7 +6053,7 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
/*
* When nested=0, all VMX instruction VM Exits filter here. The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
@@ -6191,6 +6195,15 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
}
}
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
+}
+
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
if (vmx->pml_pg) {
@@ -6202,32 +6215,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
u64 *pml_buf;
- u16 pml_idx;
+ int i;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
- if (pml_idx == (PML_ENTITY_NUM - 1))
+ if (pml_idx == PML_HEAD_INDEX)
return;
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /* PML index always points to next available PML buffer entity */
- if (pml_idx >= PML_ENTITY_NUM)
- pml_idx = 0;
- else
- pml_idx++;
-
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
pml_buf = page_address(vmx->pml_pg);
- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
- gpa = pml_buf[pml_idx];
+ gpa = pml_buf[i];
WARN_ON(gpa & (PAGE_SIZE - 1));
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
static void vmx_dump_sel(char *name, uint32_t sel)
@@ -6543,33 +6564,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- /*
- * Note:
- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
- * delivery event since it indicates guest is accessing MMIO.
- * The vm-exit can be triggered again after return to guest that
- * will cause infinite loop.
- */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
- exit_reason.basic != EXIT_REASON_NOTIFY)) {
- int ndata = 3;
-
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
- if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.data[ndata++] =
- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- }
- vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
- vcpu->run->internal.ndata = ndata;
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
return 0;
}
@@ -6862,11 +6865,32 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
read_unlock(&vcpu->kvm->mmu_lock);
}
-void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -6896,20 +6920,6 @@ static void vmx_set_rvi(int vector)
}
}
-void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
- /*
- * When running L2, updating RVI is only relevant when
- * vmcs12 virtual-interrupt-delivery enabled.
- * However, it can be enabled only when L1 also
- * intercepts external-interrupts and in that case
- * we should not update vmcs02 RVI but instead intercept
- * interrupt. Therefore, do nothing when running L2.
- */
- if (!is_guest_mode(vcpu))
- vmx_set_rvi(max_irr);
-}
-
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7828,12 +7838,8 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
* set if and only if XSAVE is supported.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
@@ -7841,7 +7847,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@ -7850,25 +7856,25 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
struct vmx_uret_msr *msr;
msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
- bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
if (kvm_cpu_cap_has(X86_FEATURE_XFD))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
if (boot_cpu_has(X86_FEATURE_IBPB))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
@@ -7876,17 +7882,17 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
else
vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_SGX_LC_ENABLED;
else
@@ -8597,7 +8603,7 @@ static void __vmx_exit(void)
vmx_cleanup_l1d_flush();
}
-static void vmx_exit(void)
+static void __exit vmx_exit(void)
{
kvm_exit();
__vmx_exit();
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 43f573f6ca46..8b111ce1087c 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -176,6 +176,7 @@ struct nested_vmx {
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
bool update_vmcs01_apicv_status;
+ bool update_vmcs01_hwapic_isr;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -330,7 +331,10 @@ struct vcpu_vmx {
bool ple_window_dirty;
/* Support for PML */
-#define PML_ENTITY_NUM 512
+#define PML_LOG_NR_ENTRIES 512
+ /* PML is written backwards: this is the first entry written by the CPU */
+#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
+
struct page *pml_pg;
/* apic deadline value in host tsc */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
index bba24ed99ee6..cdf8cbb69209 100644
--- a/arch/x86/kvm/vmx/vmx_onhyperv.h
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -3,7 +3,7 @@
#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__
#define __ARCH_X86_KVM_VMX_ONHYPERV_H__
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <linux/jump_label.h>
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index a55981c5216e..ce3295a67c04 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -47,8 +47,7 @@ bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
void vmx_migrate_timers(struct kvm_vcpu *vcpu);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
-void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
-void vmx_hwapic_isr_update(int max_isr);
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
@@ -104,8 +103,11 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
+
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c79a8cc57ba4..8e77e61d4fbd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -119,8 +119,6 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
-static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
-
#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
@@ -1179,7 +1177,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != kvm_host.xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1188,7 +1186,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
vcpu->arch.pkru != vcpu->arch.host_pkru &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
- write_pkru(vcpu->arch.pkru);
+ wrpkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@@ -1202,7 +1200,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
- write_pkru(vcpu->arch.host_pkru);
+ wrpkru(vcpu->arch.host_pkru);
}
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
@@ -1210,7 +1208,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
- if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != kvm_host.xss)
wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
@@ -1283,18 +1281,6 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
-bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- if (cr4 & cr4_reserved_bits)
- return false;
-
- if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
- return false;
-
- return true;
-}
-EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
-
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
return __kvm_is_valid_cr4(vcpu, cr4) &&
@@ -1516,10 +1502,10 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
u64 fixed = DR6_FIXED_1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
fixed |= DR6_BUS_LOCK;
return fixed;
}
@@ -1695,20 +1681,20 @@ static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
+ if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS))
return false;
- if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
+ if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT))
return false;
- if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
return false;
if (efer & (EFER_LME | EFER_LMA) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return false;
- if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
+ if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX))
return false;
return true;
@@ -1850,8 +1836,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
return 1;
if (!host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
return 1;
/*
@@ -1908,8 +1894,8 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
return 1;
if (!host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
return 1;
break;
}
@@ -2095,7 +2081,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
{
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT))
return kvm_handle_invalid_op(vcpu);
pr_warn_once("%s instruction emulated as NOP!\n", insn);
@@ -3767,13 +3753,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_ARCH_CAPABILITIES:
if (!msr_info->host_initiated ||
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
return KVM_MSR_RET_UNSUPPORTED;
vcpu->arch.arch_capabilities = data;
break;
case MSR_IA32_PERF_CAPABILITIES:
if (!msr_info->host_initiated ||
- !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return KVM_MSR_RET_UNSUPPORTED;
if (data & ~kvm_caps.supported_perf_cap)
@@ -3797,11 +3783,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((!guest_has_pred_cmd_msr(vcpu)))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB))
reserved_bits |= PRED_CMD_IBPB;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB))
reserved_bits |= PRED_CMD_SBPB;
}
@@ -3822,7 +3808,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
case MSR_IA32_FLUSH_CMD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D))
return 1;
if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
@@ -3873,7 +3859,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
- if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
@@ -3900,7 +3886,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
kvm_update_cpuid_runtime(vcpu);
@@ -4077,12 +4063,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.length = data;
break;
case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.status = data;
break;
@@ -4101,7 +4087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#ifdef CONFIG_X86_64
case MSR_IA32_XFD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
if (data & ~kvm_guest_supported_xfd(vcpu))
@@ -4111,7 +4097,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_XFD_ERR:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
if (data & ~kvm_guest_supported_xfd(vcpu))
@@ -4226,12 +4212,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.microcode_version;
break;
case MSR_IA32_ARCH_CAPABILITIES:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.arch_capabilities;
break;
case MSR_IA32_PERF_CAPABILITIES:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.perf_capabilities;
break;
@@ -4432,12 +4418,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.status;
break;
@@ -4456,14 +4442,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#ifdef CONFIG_X86_64
case MSR_IA32_XFD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
break;
case MSR_IA32_XFD_ERR:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
msr_info->data = vcpu->arch.guest_fpu.xfd_err;
@@ -4545,6 +4531,20 @@ static inline bool kvm_can_mwait_in_guest(void)
boot_cpu_has(X86_FEATURE_ARAT);
}
+static u64 kvm_get_allowed_disable_exits(void)
+{
+ u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
+ return r;
+}
+
#ifdef CONFIG_KVM_HYPERV
static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 __user *cpuid_arg)
@@ -4687,15 +4687,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r = KVM_X86_DISABLE_EXITS_PAUSE;
-
- if (!mitigate_smt_rsb) {
- r |= KVM_X86_DISABLE_EXITS_HLT |
- KVM_X86_DISABLE_EXITS_CSTATE;
-
- if (kvm_can_mwait_in_guest())
- r |= KVM_X86_DISABLE_EXITS_MWAIT;
- }
+ r = kvm_get_allowed_disable_exits();
break;
case KVM_CAP_X86_SMM:
if (!IS_ENABLED(CONFIG_KVM_SMM))
@@ -5822,9 +5814,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
- if (vcpu->arch.pv_cpuid.enforce)
- kvm_update_pv_runtime(vcpu);
-
return 0;
default:
return -EINVAL;
@@ -6542,30 +6531,32 @@ split_irqchip_unlock:
break;
case KVM_CAP_X86_DISABLE_EXITS:
r = -EINVAL;
- if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ if (cap->args[0] & ~kvm_get_allowed_disable_exits())
break;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
- kvm->arch.pause_in_guest = true;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ goto disable_exits_unlock;
#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
"KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
- if (!mitigate_smt_rsb) {
- if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
- (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
- pr_warn_once(SMT_RSB_MSG);
-
- if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
- kvm_can_mwait_in_guest())
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
- }
+ if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
+ cpu_smt_possible() &&
+ (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ pr_warn_once(SMT_RSB_MSG);
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
+ kvm->arch.pause_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT)
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
r = 0;
+disable_exits_unlock:
+ mutex_unlock(&kvm->lock);
break;
case KVM_CAP_MSR_PLATFORM_INFO:
kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
@@ -8511,17 +8502,17 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
}
static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
}
static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
@@ -8813,6 +8804,28 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ u32 reason, intr_info, error_code;
+ struct kvm_run *run = vcpu->run;
+ u64 info1, info2;
+ int ndata = 0;
+
+ kvm_x86_call(get_exit_info)(vcpu, &reason, &info1, &info2,
+ &intr_info, &error_code);
+
+ run->internal.data[ndata++] = info2;
+ run->internal.data[ndata++] = reason;
+ run->internal.data[ndata++] = info1;
+ run->internal.data[ndata++] = gpa;
+ run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ run->internal.ndata = ndata;
+}
+EXPORT_SYMBOL_GPL(kvm_prepare_event_vectoring_exit);
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
struct kvm *kvm = vcpu->kvm;
@@ -9085,6 +9098,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
return 1;
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
+ return 1;
+
+ if (r == X86EMUL_UNHANDLEABLE_VECTORING) {
+ kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa);
+ return 0;
+ }
+
WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
return handle_emulation_failure(vcpu, emulation_type);
}
@@ -9773,10 +9795,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
if (kvm_caps.has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
@@ -9979,17 +9997,19 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
if (!is_64_bit_hypercall(vcpu))
ret = (u32)ret;
kvm_rax_write(vcpu, ret);
- ++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl)
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl,
+ int (*complete_hypercall)(struct kvm_vcpu *))
{
unsigned long ret;
+ ++vcpu->stat.hypercalls;
+
trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!op_64_bit) {
@@ -10041,7 +10061,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
u64 gpa = a0, npages = a1, attrs = a2;
ret = -KVM_ENOSYS;
- if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
break;
if (!PAGE_ALIGNED(gpa) || !npages ||
@@ -10052,6 +10072,13 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = attrs;
@@ -10060,8 +10087,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
- vcpu->arch.complete_userspace_io = complete_hypercall_exit;
- /* stat is incremented on completion. */
+ vcpu->arch.complete_userspace_io = complete_hypercall;
return 0;
}
default:
@@ -10070,41 +10096,23 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
}
out:
- ++vcpu->stat.hypercalls;
- return ret;
+ vcpu->run->hypercall.ret = ret;
+ return 1;
}
-EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
- unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit;
- int cpl;
-
if (kvm_xen_hypercall_enabled(vcpu->kvm))
return kvm_xen_hypercall(vcpu);
if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- nr = kvm_rax_read(vcpu);
- a0 = kvm_rbx_read(vcpu);
- a1 = kvm_rcx_read(vcpu);
- a2 = kvm_rdx_read(vcpu);
- a3 = kvm_rsi_read(vcpu);
- op_64_bit = is_64_bit_hypercall(vcpu);
- cpl = kvm_x86_call(get_cpl)(vcpu);
-
- ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
- if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
- /* MAP_GPA tosses the request to the user space. */
- return 0;
-
- if (!op_64_bit)
- ret = (u32)ret;
- kvm_rax_write(vcpu, ret);
-
- return kvm_skip_emulated_instruction(vcpu);
+ return __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
+ is_64_bit_hypercall(vcpu),
+ kvm_x86_call(get_cpl)(vcpu),
+ complete_hypercall_exit);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -11463,6 +11471,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
int r;
+ r = kvm_mmu_post_init_vm(vcpu->kvm);
+ if (r)
+ return r;
+
vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
kvm_run->flags = 0;
@@ -12276,9 +12288,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_emulate_ctxt;
}
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
-
kvm_async_pf_hash_reset(vcpu);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
@@ -12301,6 +12310,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_xen_init_vcpu(vcpu);
vcpu_load(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu);
@@ -12731,6 +12741,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
"does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n");
}
+ once_init(&kvm->arch.nx_once);
return 0;
out_uninit_mmu:
@@ -12740,11 +12751,6 @@ out:
return ret;
}
-int kvm_arch_post_init_vm(struct kvm *kvm)
-{
- return kvm_mmu_post_init_vm(kvm);
-}
-
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
@@ -12800,7 +12806,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *slot;
- /* Called with kvm->slots_lock held. */
+ lockdep_assert_held(&kvm->slots_lock);
+
if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
return ERR_PTR_USR(-EINVAL);
@@ -12833,7 +12840,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
m.guest_phys_addr = gpa;
m.userspace_addr = hva;
m.memory_size = size;
- r = __kvm_set_memory_region(kvm, &m);
+ r = kvm_set_internal_memslot(kvm, &m);
if (r < 0)
return ERR_PTR_USR(r);
}
@@ -12934,7 +12941,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
/*
* Clear out the previous array pointers for the KVM_MR_MOVE case. The
- * old arrays will be freed by __kvm_set_memory_region() if installing
+ * old arrays will be freed by kvm_set_memory_region() if installing
* the new memslot is successful.
*/
memset(&slot->arch, 0, sizeof(slot->arch));
@@ -13027,6 +13034,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
+ if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
+ return -EINVAL;
+
return kvm_alloc_memslot_metadata(kvm, new);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index ec623d23d13d..91e50a513100 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -550,7 +550,6 @@ static inline void kvm_machine_check(void)
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
-bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
@@ -577,6 +576,11 @@ enum kvm_msr_access {
#define KVM_MSR_RET_UNSUPPORTED 2
#define KVM_MSR_RET_FILTERED 3
+static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return !(cr4 & vcpu->arch.cr4_guest_rsvd_bits);
+}
+
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \
u64 __reserved_bits = CR4_RESERVED_BITS; \
@@ -612,4 +616,32 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port, void *data, unsigned int count,
int in);
+static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr)
+{
+ return kvm->arch.hypercall_exit_enabled & BIT(hc_nr);
+}
+
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl,
+ int (*complete_hypercall)(struct kvm_vcpu *));
+
+#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, complete_hypercall) \
+({ \
+ int __ret; \
+ \
+ __ret = ____kvm_emulate_hypercall(_vcpu, \
+ kvm_##nr##_read(_vcpu), kvm_##a0##_read(_vcpu), \
+ kvm_##a1##_read(_vcpu), kvm_##a2##_read(_vcpu), \
+ kvm_##a3##_read(_vcpu), op_64_bit, cpl, \
+ complete_hypercall); \
+ \
+ if (__ret > 0) \
+ __ret = complete_hypercall(_vcpu); \
+ __ret; \
+})
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
#endif