Diffstat (limited to 'arch/x86/kvm')
64 files changed, 9531 insertions, 2555 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..2eeffcec5382 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG + select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO @@ -94,6 +95,8 @@ config KVM_SW_PROTECTED_VM config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL + select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST + select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -128,6 +131,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ae0b438a2c99..b2d006756e02 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -21,7 +21,7 @@ #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -58,49 +58,30 @@ void __init kvm_init_xstate_sizes(void) u32 xstate_required_size(u64 xstate_bv, bool compacted) { - int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + int i; xstate_bv &= XFEATURE_MASK_EXTEND; - while (xstate_bv) { - if (xstate_bv & 0x1) { - struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit]; - u32 offset; - - /* ECX[1]: 64B alignment in compacted form */ - if (compacted) - offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; - else - offset = xs->ebx; - ret = max(ret, offset + xs->eax); - } + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + u32 offset; - xstate_bv >>= 1; - feature_bit++; + if (!(xstate_bv & BIT_ULL(i))) + continue; + + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = xs->ebx; + ret = max(ret, offset + xs->eax); + xstate_bv &= ~BIT_ULL(i); } return ret; } -#define F feature_bit - -/* Scattered Flag - For features that are scattered by cpufeatures.h. */ -#define SF(name) \ -({ \ - BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ - (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \ -}) - -/* - * Magic value used by KVM when querying userspace-provided CPUID entries and - * doesn't care about the CPIUD index because the index of the function in - * question is not significant. 
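
/*
 * Editor's illustrative sketch, not part of the patch: a standalone model of
 * the size computation in the reworked xstate_required_size() earlier in this
 * hunk. eax/ebx/ecx mirror CPUID.0xD.<i> output (eax = component size,
 * ebx = fixed non-compacted offset, ecx bit 1 = 64-byte alignment in the
 * compacted format). The component table and values below are fabricated.
 */
#include <stdint.h>
#include <stdio.h>

#define XSAVE_LEGACY_AND_HEADER	576u	/* 512-byte legacy region + 64-byte XSAVE header */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

struct xcomp { uint32_t eax, ebx, ecx; };

static uint32_t xstate_size_model(uint64_t xstate_bv, const struct xcomp *xs,
				  unsigned int nr, int compacted)
{
	uint32_t ret = XSAVE_LEGACY_AND_HEADER;
	unsigned int i;

	/* Components 0 and 1 (FP/SSE) live in the legacy area, start at 2. */
	for (i = 2; i < nr && xstate_bv; i++) {
		uint32_t offset;

		if (!(xstate_bv & (1ull << i)))
			continue;

		if (compacted)
			offset = (xs[i].ecx & 0x2) ? ALIGN_UP(ret, 64) : ret;
		else
			offset = xs[i].ebx;

		if (offset + xs[i].eax > ret)
			ret = offset + xs[i].eax;
		xstate_bv &= ~(1ull << i);
	}
	return ret;
}

int main(void)
{
	/* Fabricated table: component 2 (AVX) is 256 bytes at fixed offset 576. */
	static const struct xcomp xs[3] = { [2] = { .eax = 256, .ebx = 576, .ecx = 0 } };

	printf("%u\n", xstate_size_model(1ull << 2, xs, 3, 0));	/* prints 832 */
	return 0;
}
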
Note, this magic value must have at least one - * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() - * to avoid false positives when processing guest CPUID input. - */ -#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull - -static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; @@ -150,10 +131,9 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( return NULL; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); -static int kvm_check_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, - int nent) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; u64 xfeatures; @@ -162,8 +142,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, - KVM_CPUID_INDEX_NOT_SIGNIFICANT); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -175,7 +154,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * Exposing dynamic xfeatures to the guest requires additional * enabling in the FPU, e.g. to expand the guest XSAVE state size. */ - best = cpuid_entry2_find(entries, nent, 0xd, 0); + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0); if (!best) return 0; @@ -187,6 +166,9 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } +static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu); +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); + /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) @@ -194,6 +176,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 struct kvm_cpuid_entry2 *orig; int i; + /* + * Apply runtime CPUID updates to the incoming CPUID entries to avoid + * false positives due mismatches on KVM-owned feature flags. + * + * Note! @e2 and @nent track the _old_ CPUID entries! 
+ */ + kvm_update_cpuid_runtime(vcpu); + kvm_apply_cpuid_pv_features_quirk(vcpu); + if (nent != vcpu->arch.cpuid_nent) return -EINVAL; @@ -210,15 +201,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries, - int nent, const char *sig) +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; - for_each_possible_hypervisor_cpuid_base(base) { - entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT); + for_each_possible_cpuid_base_hypervisor(base) { + entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; @@ -238,118 +229,91 @@ static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_e return cpuid; } -static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, - const char *sig) -{ - return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent, sig); -} - -static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, - int nent, u32 kvm_cpuid_base) +static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu) { - return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES, - KVM_CPUID_INDEX_NOT_SIGNIFICANT); -} - -static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) -{ - u32 base = vcpu->arch.kvm_cpuid.base; + struct kvm_hypervisor_cpuid kvm_cpuid; + struct kvm_cpuid_entry2 *best; - if (!base) - return NULL; + kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); + if (!kvm_cpuid.base) + return 0; - return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent, base); -} + best = kvm_find_cpuid_entry(vcpu, kvm_cpuid.base | KVM_CPUID_FEATURES); + if (!best) + return 0; -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); + if (kvm_hlt_in_guest(vcpu->kvm)) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); - /* - * save the feature bitmap to avoid cpuid lookup for every PV - * operation - */ - if (best) - vcpu->arch.pv_cpuid.features = best->eax; + return best->eax; } /* * Calculate guest's supported XCR0 taking into account guest CPUID data and * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). 
*/ -static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) +static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = cpuid_entry2_find(entries, nent, 0xd, 0); + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0); if (!best) return 0; return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } -static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, - int nent) +static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature, + bool has_feature) +{ + cpuid_entry_change(entry, x86_feature, has_feature); + guest_cpu_cap_change(vcpu, x86_feature, has_feature); +} + +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - struct kvm_hypervisor_cpuid kvm_cpuid; - best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); + vcpu->arch.cpuid_dynamic_bits_dirty = false; + + best = kvm_find_cpuid_entry(vcpu, 1); if (best) { - /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE, kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)); - cpuid_entry_change(best, X86_FEATURE_APIC, - vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_APIC, + vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); + + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_MWAIT, + vcpu->arch.ia32_misc_enable_msr & + MSR_IA32_MISC_ENABLE_MWAIT); } - best = cpuid_entry2_find(entries, nent, 7, 0); - if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) - cpuid_entry_change(best, X86_FEATURE_OSPKE, - kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); + best = kvm_find_cpuid_entry_index(vcpu, 7, 0); + if (best) + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSPKE, + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); + - best = cpuid_entry2_find(entries, nent, 0xD, 0); + best = kvm_find_cpuid_entry_index(vcpu, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = cpuid_entry2_find(entries, nent, 0xD, 1); + best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - - kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE); - if (kvm_cpuid.base) { - best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base); - if (kvm_hlt_in_guest(vcpu->kvm) && best) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); - } - - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); - if (best) - cpuid_entry_change(best, X86_FEATURE_MWAIT, - vcpu->arch.ia32_misc_enable_msr & - MSR_IA32_MISC_ENABLE_MWAIT); - } } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) -{ - __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); -} -EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); - -static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent) +static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV struct kvm_cpuid_entry2 *entry; - entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE, - 
KVM_CPUID_INDEX_NOT_SIGNIFICANT); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX; #else return false; @@ -368,15 +332,71 @@ static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx); } -static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +/* + * This isn't truly "unsafe", but except for the cpu_caps initialization code, + * all register lookups should use __cpuid_entry_get_reg(), which provides + * compile-time validation of the input. + */ +static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg) +{ + switch (reg) { + case CPUID_EAX: + return entry->eax; + case CPUID_EBX: + return entry->ebx; + case CPUID_ECX: + return entry->ecx; + case CPUID_EDX: + return entry->edx; + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, + bool include_partially_emulated); + +void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry2 *entry; bool allow_gbpages; + int i; - BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); - bitmap_zero(vcpu->arch.governed_features.enabled, - KVM_MAX_NR_GOVERNED_FEATURES); + memset(vcpu->arch.cpu_caps, 0, sizeof(vcpu->arch.cpu_caps)); + BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS); + + /* + * Reset guest capabilities to userspace's guest CPUID definition, i.e. + * honor userspace's definition for features that don't require KVM or + * hardware management/support (or that KVM simply doesn't care about). + */ + for (i = 0; i < NR_KVM_CPU_CAPS; i++) { + const struct cpuid_reg cpuid = reverse_cpuid[i]; + struct kvm_cpuid_entry2 emulated; + + if (!cpuid.function) + continue; + + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); + if (!entry) + continue; + + cpuid_func_emulated(&emulated, cpuid.function, true); + + /* + * A vCPU has a feature if it's supported by KVM and is enabled + * in guest CPUID. Note, this includes features that are + * supported by KVM but aren't advertised to userspace! + */ + vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] | + cpuid_get_reg_unsafe(&emulated, cpuid.reg); + vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, cpuid.reg); + } + + kvm_update_cpuid_runtime(vcpu); /* * If TDP is enabled, let the guest use GBPAGES if they're supported in @@ -390,9 +410,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * and can install smaller shadow pages if the host lacks 1GiB support. */ allow_gbpages = tdp_enabled ? 
boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); - if (allow_gbpages) - kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES); + guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { @@ -404,21 +423,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - vcpu->arch.guest_supported_xcr0 = - cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); + vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); - kvm_update_pv_runtime(vcpu); + vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); - vcpu->arch.cr4_guest_rsvd_bits = - __cr4_reserved_bits(guest_cpuid_has, vcpu); - kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent)); +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) | + __cr4_reserved_bits(guest_cpu_cap_has, vcpu); +#undef __kvm_cpu_cap_has + + kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu)); /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_call(vcpu_after_set_cpuid)(vcpu); @@ -444,6 +464,20 @@ not_found: return 36; } +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008); + if (best) + return (best->eax >> 16) & 0xff; +not_found: + return 0; +} + /* * This "raw" version returns the reserved GPA bits without any adjustments for * encryption technologies that usurp bits. The raw mask should be used if and @@ -457,9 +491,25 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { + u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; - __kvm_update_cpuid_runtime(vcpu, e2, nent); + /* + * Swap the existing (old) entries with the incoming (new) entries in + * order to massage the new entries, e.g. to account for dynamic bits + * that KVM controls, without clobbering the current guest CPUID, which + * KVM needs to preserve in order to unwind on failure. + * + * Similarly, save the vCPU's current cpu_caps so that the capabilities + * can be updated alongside the CPUID entries when performing runtime + * updates. Full initialization is done if and only if the vCPU hasn't + * run, i.e. only if userspace is potentially changing CPUID features. 
+ */ + swap(vcpu->arch.cpuid_entries, e2); + swap(vcpu->arch.cpuid_nent, nent); + + memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps)); + BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); /* * KVM does not correctly handle changing guest CPUID after KVM_RUN, as @@ -475,35 +525,36 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, if (kvm_vcpu_has_run(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) - return r; - - kvfree(e2); - return 0; + goto err; + goto success; } #ifdef CONFIG_KVM_HYPERV - if (kvm_cpuid_has_hyperv(e2, nent)) { + if (kvm_cpuid_has_hyperv(vcpu)) { r = kvm_hv_vcpu_init(vcpu); if (r) - return r; + goto err; } #endif - r = kvm_check_cpuid(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu); if (r) - return r; - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = nent; + goto err; - vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); #ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); #endif kvm_vcpu_after_set_cpuid(vcpu); +success: + kvfree(e2); return 0; + +err: + memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps)); + swap(vcpu->arch.cpuid_entries, e2); + swap(vcpu->arch.cpuid_nent, nent); + return r; } /* when an old userspace process fills a new kernel module */ @@ -582,6 +633,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) return -E2BIG; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; @@ -590,107 +644,294 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return 0; } -/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ -static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) +static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid) { - const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; + u32 base; + + /* + * KVM only supports features defined by Intel (0x0), AMD (0x80000000), + * and Centaur (0xc0000000). WARN if a feature for new vendor base is + * defined, as this and other code would need to be updated. + */ + base = cpuid.function & 0xffff0000; + if (WARN_ON_ONCE(base && base != 0x80000000 && base != 0xc0000000)) + return 0; - reverse_cpuid_check(leaf); + if (cpuid_eax(base) < cpuid.function) + return 0; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); - kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); + return *__cpuid_entry_get_reg(&entry, cpuid.reg); } -static __always_inline -void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) -{ - /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */ - BUILD_BUG_ON(leaf < NCAPINTS); +/* + * For kernel-defined leafs, mask KVM's supported feature set with the kernel's + * capabilities as well as raw CPUID. For KVM-defined leafs, consult only raw + * CPUID, as KVM is the one and only authority (in the kernel). + */ +#define kvm_cpu_cap_init(leaf, feature_initializers...) 
\ +do { \ + const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); \ + const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf; \ + const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability; \ + u32 kvm_cpu_cap_passthrough = 0; \ + u32 kvm_cpu_cap_synthesized = 0; \ + u32 kvm_cpu_cap_emulated = 0; \ + u32 kvm_cpu_cap_features = 0; \ + \ + feature_initializers \ + \ + kvm_cpu_caps[leaf] = kvm_cpu_cap_features; \ + \ + if (leaf < NCAPINTS) \ + kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf]; \ + \ + kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough; \ + kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) | \ + kvm_cpu_cap_synthesized); \ + kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated; \ +} while (0) - kvm_cpu_caps[leaf] = mask; +/* + * Assert that the feature bit being declared, e.g. via F(), is in the CPUID + * word that's being initialized. Exempt 0x8000_0001.EDX usage of 0x1.EDX + * features, as AMD duplicated many 0x1.EDX features into 0x8000_0001.EDX. + */ +#define KVM_VALIDATE_CPU_CAP_USAGE(name) \ +do { \ + u32 __leaf = __feature_leaf(X86_FEATURE_##name); \ + \ + BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress); \ +} while (0) + +#define F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ + kvm_cpu_cap_features |= feature_bit(name); \ +}) - __kvm_cpu_cap_mask(leaf); -} +/* Scattered Flag - For features that are scattered by cpufeatures.h. */ +#define SCATTERED_F(name) \ +({ \ + BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ + if (boot_cpu_has(X86_FEATURE_##name)) \ + F(name); \ +}) -static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) -{ - /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ - BUILD_BUG_ON(leaf >= NCAPINTS); +/* Features that KVM supports only on 64-bit kernels. */ +#define X86_64_F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ + if (IS_ENABLED(CONFIG_X86_64)) \ + F(name); \ +}) - kvm_cpu_caps[leaf] &= mask; +/* + * Emulated Feature - For features that KVM emulates in software irrespective + * of host CPU/kernel support. + */ +#define EMULATED_F(name) \ +({ \ + kvm_cpu_cap_emulated |= feature_bit(name); \ + F(name); \ +}) - __kvm_cpu_cap_mask(leaf); -} +/* + * Synthesized Feature - For features that are synthesized into boot_cpu_data, + * i.e. may not be present in the raw CPUID, but can still be advertised to + * userspace. Primarily used for mitigation related feature flags. + */ +#define SYNTHESIZED_F(name) \ +({ \ + kvm_cpu_cap_synthesized |= feature_bit(name); \ + F(name); \ +}) + +/* + * Passthrough Feature - For features that KVM supports based purely on raw + * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't + * use the feature. Simply force set the feature in KVM's capabilities, raw + * CPUID support will be factored in by kvm_cpu_cap_mask(). + */ +#define PASSTHROUGH_F(name) \ +({ \ + kvm_cpu_cap_passthrough |= feature_bit(name); \ + F(name); \ +}) + +/* + * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of + * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001. + */ +#define ALIASED_1_EDX_F(name) \ +({ \ + BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX); \ + BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX); \ + kvm_cpu_cap_features |= feature_bit(name); \ +}) + +/* + * Vendor Features - For features that KVM supports, but are added in later + * because they require additional vendor enabling. 
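
/*
 * Editor's illustrative sketch, not part of the patch: a simplified,
 * standalone model of the mask combination that the kvm_cpu_cap_init()
 * macro above performs for one CPUID word. Function and parameter names
 * here are invented; the constants in main() are made up for demonstration.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t cap_init_model(uint32_t features, uint32_t kernel_caps,
			       uint32_t raw_cpuid, uint32_t passthrough,
			       uint32_t synthesized, uint32_t emulated,
			       int kernel_defined_leaf)
{
	uint32_t caps = features;		/* everything declared via F() and friends */

	if (kernel_defined_leaf)		/* i.e. leaf < NCAPINTS in the real macro */
		caps &= kernel_caps;		/* honor boot_cpu_data for kernel-defined words */

	caps |= passthrough;			/* PASSTHROUGH_F(): force-set before the raw AND */
	caps &= raw_cpuid | synthesized;	/* SYNTHESIZED_F() bits survive the raw-CPUID AND */
	caps |= emulated;			/* EMULATED_F(): set even without hardware support */

	return caps;
}

int main(void)
{
	/* Made-up masks purely for illustration. */
	printf("0x%x\n", cap_init_model(0x1f, 0x17, 0x0b, 0x20, 0x04, 0x40, 1));
	return 0;
}
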
+ */ +#define VENDOR_F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ +}) + +/* + * Runtime Features - For features that KVM dynamically sets/clears at runtime, + * e.g. when CR4 changes, but which are never advertised to userspace. + */ +#define RUNTIME_F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ +}) + +/* + * Undefine the MSR bit macro to avoid token concatenation issues when + * processing X86_FEATURE_SPEC_CTRL_SSBD. + */ +#undef SPEC_CTRL_SSBD + +/* DS is defined by ptrace-abi.h on 32-bit builds. */ +#undef DS void kvm_set_cpu_caps(void) { -#ifdef CONFIG_X86_64 - unsigned int f_gbpages = F(GBPAGES); - unsigned int f_lm = F(LM); - unsigned int f_xfd = F(XFD); -#else - unsigned int f_gbpages = 0; - unsigned int f_lm = 0; - unsigned int f_xfd = 0; -#endif memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); - memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, - sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); - - kvm_cpu_cap_mask(CPUID_1_ECX, + kvm_cpu_cap_init(CPUID_1_ECX, + F(XMM3), + F(PCLMULQDQ), + VENDOR_F(DTES64), /* * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* - * advertised to guests via CPUID! + * advertised to guests via CPUID! MWAIT is also technically a + * runtime flag thanks to IA32_MISC_ENABLES; mark it as such so + * that KVM is aware that it's a known, unadvertised flag. */ - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) | - F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND) + RUNTIME_F(MWAIT), + /* DS-CPL */ + VENDOR_F(VMX), + /* SMX, EST */ + /* TM2 */ + F(SSSE3), + /* CNXT-ID */ + /* Reserved */ + F(FMA), + F(CX16), + /* xTPR Update */ + F(PDCM), + F(PCID), + /* Reserved, DCA */ + F(XMM4_1), + F(XMM4_2), + EMULATED_F(X2APIC), + F(MOVBE), + F(POPCNT), + EMULATED_F(TSC_DEADLINE_TIMER), + F(AES), + F(XSAVE), + RUNTIME_F(OSXSAVE), + F(AVX), + F(F16C), + F(RDRAND), + EMULATED_F(HYPERVISOR), + ); + + kvm_cpu_cap_init(CPUID_1_EDX, + F(FPU), + F(VME), + F(DE), + F(PSE), + F(TSC), + F(MSR), + F(PAE), + F(MCE), + F(CX8), + F(APIC), + /* Reserved */ + F(SEP), + F(MTRR), + F(PGE), + F(MCA), + F(CMOV), + F(PAT), + F(PSE36), + /* PSN */ + F(CLFLUSH), + /* Reserved */ + VENDOR_F(DS), + /* ACPI */ + F(MMX), + F(FXSR), + F(XMM), + F(XMM2), + F(SELFSNOOP), + /* HTT, TM, Reserved, PBE */ ); - /* KVM emulates x2apic in software irrespective of host support. 
*/ - kvm_cpu_cap_set(X86_FEATURE_X2APIC); - - kvm_cpu_cap_mask(CPUID_1_EDX, - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */ + + kvm_cpu_cap_init(CPUID_7_0_EBX, + F(FSGSBASE), + EMULATED_F(TSC_ADJUST), + F(SGX), + F(BMI1), + F(HLE), + F(AVX2), + F(FDP_EXCPTN_ONLY), + F(SMEP), + F(BMI2), + F(ERMS), + F(INVPCID), + F(RTM), + F(ZERO_FCS_FDS), + VENDOR_F(MPX), + F(AVX512F), + F(AVX512DQ), + F(RDSEED), + F(ADX), + F(SMAP), + F(AVX512IFMA), + F(CLFLUSHOPT), + F(CLWB), + VENDOR_F(INTEL_PT), + F(AVX512PF), + F(AVX512ER), + F(AVX512CD), + F(SHA_NI), + F(AVX512BW), + F(AVX512VL), ); - kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | - F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | - F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) | - F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | - F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) | - F(AVX512VL)); - - kvm_cpu_cap_mask(CPUID_7_ECX, - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | - F(SGX_LC) | F(BUS_LOCK_DETECT) + kvm_cpu_cap_init(CPUID_7_ECX, + F(AVX512VBMI), + PASSTHROUGH_F(LA57), + F(PKU), + RUNTIME_F(OSPKE), + F(RDPID), + F(AVX512_VPOPCNTDQ), + F(UMIP), + F(AVX512_VBMI2), + F(GFNI), + F(VAES), + F(VPCLMULQDQ), + F(AVX512_VNNI), + F(AVX512_BITALG), + F(CLDEMOTE), + F(MOVDIRI), + F(MOVDIR64B), + VENDOR_F(WAITPKG), + F(SGX_LC), + F(BUS_LOCK_DETECT), ); - /* Set LA57 based on hardware capability. */ - if (cpuid_ecx(7) & F(LA57)) - kvm_cpu_cap_set(X86_FEATURE_LA57); /* * PKU not yet implemented for shadow paging and requires OSPKE @@ -699,18 +940,25 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); - kvm_cpu_cap_mask(CPUID_7_EDX, - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) + kvm_cpu_cap_init(CPUID_7_EDX, + F(AVX512_4VNNIW), + F(AVX512_4FMAPS), + F(SPEC_CTRL), + F(SPEC_CTRL_SSBD), + EMULATED_F(ARCH_CAPABILITIES), + F(INTEL_STIBP), + F(MD_CLEAR), + F(AVX512_VP2INTERSECT), + F(FSRM), + F(SERIALIZE), + F(TSXLDTRK), + F(AVX512_FP16), + F(AMX_TILE), + F(AMX_INT8), + F(AMX_BF16), + F(FLUSH_L1D), ); - /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. 
*/ - kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST); - kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES); - if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && boot_cpu_has(X86_FEATURE_AMD_IBPB) && boot_cpu_has(X86_FEATURE_AMD_IBRS)) @@ -720,65 +968,135 @@ void kvm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); - kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(SHA512) | F(SM3) | F(SM4) | F(AVX_VNNI) | F(AVX512_BF16) | - F(CMPCCXADD) | F(FZRM) | F(FSRS) | F(FSRC) | F(AMX_FP16) | - F(AVX_IFMA) | F(LAM) + kvm_cpu_cap_init(CPUID_7_1_EAX, + F(SHA512), + F(SM3), + F(SM4), + F(AVX_VNNI), + F(AVX512_BF16), + F(CMPCCXADD), + F(FZRM), + F(FSRS), + F(FSRC), + F(WRMSRNS), + F(AMX_FP16), + F(AVX_IFMA), + F(LAM), ); - kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(AMX_COMPLEX) | - F(AVX_VNNI_INT16) | F(PREFETCHITI) | F(AVX10) + kvm_cpu_cap_init(CPUID_7_1_EDX, + F(AVX_VNNI_INT8), + F(AVX_NE_CONVERT), + F(AMX_COMPLEX), + F(AVX_VNNI_INT16), + F(PREFETCHITI), + F(AVX10), ); - kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, - F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) | - F(BHI_CTRL) | F(MCDT_NO) + kvm_cpu_cap_init(CPUID_7_2_EDX, + F(INTEL_PSFD), + F(IPRED_CTRL), + F(RRSBA_CTRL), + F(DDPD_U), + F(BHI_CTRL), + F(MCDT_NO), ); - kvm_cpu_cap_mask(CPUID_D_1_EAX, - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd + kvm_cpu_cap_init(CPUID_D_1_EAX, + F(XSAVEOPT), + F(XSAVEC), + F(XGETBV1), + F(XSAVES), + X86_64_F(XFD), ); - kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, - SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) + kvm_cpu_cap_init(CPUID_12_EAX, + SCATTERED_F(SGX1), + SCATTERED_F(SGX2), + SCATTERED_F(SGX_EDECCSSA), ); - kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX, - F(AVX10_128) | F(AVX10_256) | F(AVX10_512) + kvm_cpu_cap_init(CPUID_24_0_EBX, + F(AVX10_128), + F(AVX10_256), + F(AVX10_512), ); - kvm_cpu_cap_mask(CPUID_8000_0001_ECX, - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | 0 /* PERFCTR_CORE */ + kvm_cpu_cap_init(CPUID_8000_0001_ECX, + F(LAHF_LM), + F(CMP_LEGACY), + VENDOR_F(SVM), + /* ExtApicSpace */ + F(CR8_LEGACY), + F(ABM), + F(SSE4A), + F(MISALIGNSSE), + F(3DNOWPREFETCH), + F(OSVW), + /* IBS */ + F(XOP), + /* SKINIT, WDT, LWP */ + F(FMA4), + F(TBM), + F(TOPOEXT), + VENDOR_F(PERFCTR_CORE), ); - kvm_cpu_cap_mask(CPUID_8000_0001_EDX, - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) + kvm_cpu_cap_init(CPUID_8000_0001_EDX, + ALIASED_1_EDX_F(FPU), + ALIASED_1_EDX_F(VME), + ALIASED_1_EDX_F(DE), + ALIASED_1_EDX_F(PSE), + ALIASED_1_EDX_F(TSC), + ALIASED_1_EDX_F(MSR), + ALIASED_1_EDX_F(PAE), + ALIASED_1_EDX_F(MCE), + ALIASED_1_EDX_F(CX8), + ALIASED_1_EDX_F(APIC), + /* Reserved */ + F(SYSCALL), + ALIASED_1_EDX_F(MTRR), + ALIASED_1_EDX_F(PGE), + ALIASED_1_EDX_F(MCA), + ALIASED_1_EDX_F(CMOV), + ALIASED_1_EDX_F(PAT), + ALIASED_1_EDX_F(PSE36), + /* Reserved */ + F(NX), + /* Reserved */ + F(MMXEXT), + ALIASED_1_EDX_F(MMX), + ALIASED_1_EDX_F(FXSR), + F(FXSR_OPT), + X86_64_F(GBPAGES), + F(RDTSCP), + /* Reserved */ + X86_64_F(LM), + F(3DNOWEXT), + 
F(3DNOW), ); if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); - kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, - SF(CONSTANT_TSC) + kvm_cpu_cap_init(CPUID_8000_0007_EDX, + SCATTERED_F(CONSTANT_TSC), ); - kvm_cpu_cap_mask(CPUID_8000_0008_EBX, - F(CLZERO) | F(XSAVEERPTR) | - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - F(AMD_PSFD) | F(AMD_IBPB_RET) + kvm_cpu_cap_init(CPUID_8000_0008_EBX, + F(CLZERO), + F(XSAVEERPTR), + F(WBNOINVD), + F(AMD_IBPB), + F(AMD_IBRS), + F(AMD_SSBD), + F(VIRT_SSBD), + F(AMD_SSB_NO), + F(AMD_STIBP), + F(AMD_STIBP_ALWAYS_ON), + F(AMD_IBRS_SAME_MODE), + F(AMD_PSFD), + F(AMD_IBPB_RET), ); /* @@ -808,50 +1126,77 @@ void kvm_set_cpu_caps(void) !boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* - * Hide all SVM features by default, SVM will set the cap bits for - * features it emulates and/or exposes for L1. - */ - kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); - - kvm_cpu_cap_mask(CPUID_8000_001F_EAX, - 0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ | - F(SME_COHERENT)); + /* All SVM features required additional vendor module enabling. */ + kvm_cpu_cap_init(CPUID_8000_000A_EDX, + VENDOR_F(NPT), + VENDOR_F(VMCBCLEAN), + VENDOR_F(FLUSHBYASID), + VENDOR_F(NRIPS), + VENDOR_F(TSCRATEMSR), + VENDOR_F(V_VMSAVE_VMLOAD), + VENDOR_F(LBRV), + VENDOR_F(PAUSEFILTER), + VENDOR_F(PFTHRESHOLD), + VENDOR_F(VGIF), + VENDOR_F(VNMI), + VENDOR_F(SVME_ADDR_CHK), + ); - kvm_cpu_cap_mask(CPUID_8000_0021_EAX, - F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ | - F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ | - F(WRMSR_XX_BASE_NS) + kvm_cpu_cap_init(CPUID_8000_001F_EAX, + VENDOR_F(SME), + VENDOR_F(SEV), + /* VM_PAGE_FLUSH */ + VENDOR_F(SEV_ES), + F(SME_COHERENT), ); - kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); - kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE); - kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_init(CPUID_8000_0021_EAX, + F(NO_NESTED_DATA_BP), + F(WRMSR_XX_BASE_NS), + /* + * Synthesize "LFENCE is serializing" into the AMD-defined entry + * in KVM's supported CPUID, i.e. if the feature is reported as + * supported by the kernel. LFENCE_RDTSC was a Linux-defined + * synthetic feature long before AMD joined the bandwagon, e.g. + * LFENCE is serializing on most CPUs that support SSE2. On + * CPUs that don't support AMD's leaf, ANDing with the raw host + * CPUID will drop the flags, and reporting support in AMD's + * leaf can make it easier for userspace to detect the feature. + */ + SYNTHESIZED_F(LFENCE_RDTSC), + /* SmmPgCfgLock */ + F(NULL_SEL_CLR_BASE), + /* UpperAddressIgnore */ + F(AUTOIBRS), + F(PREFETCHI), + EMULATED_F(NO_SMM_CTL_MSR), + /* PrefetchCtlMsr */ + /* GpOnUserCpuid */ + /* EPSF */ + SYNTHESIZED_F(SBPB), + SYNTHESIZED_F(IBPB_BRTYPE), + SYNTHESIZED_F(SRSO_NO), + F(SRSO_USER_KERNEL_NO), + ); - kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, - F(PERFMON_V2) + kvm_cpu_cap_init(CPUID_8000_0022_EAX, + F(PERFMON_V2), ); - /* - * Synthesize "LFENCE is serializing" into the AMD-defined entry in - * KVM's supported CPUID if the feature is reported as supported by the - * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long - * before AMD joined the bandwagon, e.g. LFENCE is serializing on most - * CPUs that support SSE2. 
On CPUs that don't support AMD's leaf, - * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing - * the mask with the raw host CPUID, and reporting support in AMD's - * leaf can make it easier for userspace to detect the feature. - */ - if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE); - kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR); - kvm_cpu_cap_mask(CPUID_C000_0001_EDX, - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN) + kvm_cpu_cap_init(CPUID_C000_0001_EDX, + F(XSTORE), + F(XSTORE_EN), + F(XCRYPT), + F(XCRYPT_EN), + F(ACE2), + F(ACE2_EN), + F(PHE), + F(PHE_EN), + F(PMM), + F(PMM_EN), ); /* @@ -871,6 +1216,16 @@ void kvm_set_cpu_caps(void) } EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); +#undef F +#undef SCATTERED_F +#undef X86_64_F +#undef EMULATED_F +#undef SYNTHESIZED_F +#undef PASSTHROUGH_F +#undef ALIASED_1_EDX_F +#undef VENDOR_F +#undef RUNTIME_F + struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; int maxnent; @@ -928,14 +1283,11 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, return entry; } -static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) +static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, + bool include_partially_emulated) { - struct kvm_cpuid_entry2 *entry; - - if (array->nent >= array->maxnent) - return -E2BIG; + memset(entry, 0, sizeof(*entry)); - entry = &array->entries[array->nent]; entry->function = func; entry->index = 0; entry->flags = 0; @@ -943,23 +1295,37 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) switch (func) { case 0: entry->eax = 7; - ++array->nent; - break; + return 1; case 1: - entry->ecx = F(MOVBE); - ++array->nent; - break; + entry->ecx = feature_bit(MOVBE); + /* + * KVM allows userspace to enumerate MONITOR+MWAIT support to + * the guest, but the MWAIT feature flag is never advertised + * to userspace because MONITOR+MWAIT aren't virtualized by + * hardware, can't be faithfully emulated in software (KVM + * emulates them as NOPs), and allowing the guest to execute + * them natively requires enabling a per-VM capability. 
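
/*
 * Editor's illustrative sketch, not part of the patch: the "per-VM
 * capability" mentioned above is KVM_CAP_X86_DISABLE_EXITS. A VMM that
 * wants the guest to execute MONITOR/MWAIT natively would enable it
 * roughly like this, on the VM file descriptor and before creating
 * vCPUs (error handling omitted; vm_fd is assumed to already exist).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int allow_native_mwait(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args[0] = KVM_X86_DISABLE_EXITS_MWAIT,
	};

	/* 0 on success, -1 with errno set on failure. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
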
+ */ + if (include_partially_emulated) + entry->ecx |= feature_bit(MWAIT); + return 1; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) - entry->ecx = F(RDPID); - ++array->nent; - break; + entry->ecx = feature_bit(RDPID); + return 1; default: - break; + return 0; } +} + +static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) +{ + if (array->nent >= array->maxnent) + return -E2BIG; + array->nent += cpuid_func_emulated(&array->entries[array->nent], func, false); return 0; } @@ -1053,8 +1419,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1070,8 +1436,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1103,7 +1467,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_D_1_EAX); - if (entry->eax & (F(XSAVES)|F(XSAVEC))) + if (entry->eax & (feature_bit(XSAVES) | feature_bit(XSAVEC))) entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { @@ -1334,7 +1698,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) phys_as = entry->eax & 0xff; g_phys_as = phys_as; if (kvm_mmu_get_max_tdp_level() < 5) - g_phys_as = min(g_phys_as, 48); + g_phys_as = min(g_phys_as, 48U); } entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); @@ -1389,23 +1753,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { - entry->eax = entry->ebx; + entry->eax = entry->ebx = 0; break; } cpuid_entry_override(entry, CPUID_8000_0022_EAX); - if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) - ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; - else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; - else - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; - + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; entry->ebx = ebx.full; break; } @@ -1540,22 +1898,6 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, - function, index); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function) -{ - return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, - function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - /* * Intel CPUID semantics treats any query for an out-of-range leaf as if the * highest basic leaf (i.e. CPUID.0H:EAX) were requested. 
AMD CPUID semantics @@ -1631,6 +1973,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; @@ -1646,12 +1991,29 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, *edx = entry->edx; if (function == 7 && index == 0) { u64 data; - if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && + !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) - *ebx &= ~(F(RTM) | F(HLE)); + *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { if (kvm_hv_invtsc_suppressed(vcpu)) - *edx &= ~SF(CONSTANT_TSC); + *edx &= ~feature_bit(CONSTANT_TSC); + } else if (IS_ENABLED(CONFIG_KVM_XEN) && + kvm_xen_is_tsc_leaf(vcpu, function)) { + /* + * Update guest TSC frequency information if necessary. + * Ignore failures, there is no sane value that can be + * provided if KVM can't get the TSC frequency. + */ + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) + kvm_guest_time_update(vcpu); + + if (index == 1) { + *ecx = vcpu->arch.pvclock_tsc_mul; + *edx = vcpu->arch.pvclock_tsc_shift; + } else if (index == 2) { + *eax = vcpu->arch.hw_tsc_khz; + } } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f16a7b2c2adc..d3f5ae15a7ca 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -10,12 +10,35 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function); +void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, + int nent, u32 function, u64 index); +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2() + * to avoid false positives when processing guest CPUID input. + * + * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of + * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry(). 
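
/*
 * Editor's illustrative sketch, not part of the patch and not the kernel's
 * exact lookup logic: a standalone model of why the sentinel below must be
 * consumed as a u64 with bits[63:32] set. Guest ECX is only 32 bits wide,
 * so when indices are compared at u64 width no genuine guest index can
 * ever collide with the "index not significant" value.
 */
#include <assert.h>
#include <stdint.h>

#define INDEX_NOT_SIGNIFICANT	(~0ull)

static int index_matches(uint64_t wanted, uint32_t entry_index, int index_significant)
{
	if (!index_significant || wanted == INDEX_NOT_SIGNIFICANT)
		return 1;				/* index doesn't matter for this entry/query */

	return (uint64_t)entry_index == wanted;		/* compared as u64, so no truncation */
}

int main(void)
{
	/* A guest passing ECX = 0xffffffff must NOT be mistaken for "don't care". */
	assert(!index_matches(0xffffffffu, 7, 1));
	assert(index_matches(INDEX_NOT_SIGNIFICANT, 7, 1));
	return 0;
}
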
+ */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); +} + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} + int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -35,6 +58,7 @@ void __init kvm_init_xstate_sizes(void); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) @@ -67,41 +91,40 @@ static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, *reg = kvm_cpu_caps[leaf]; } -static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; + u32 *reg; + + /* + * XSAVES is a special snowflake. Due to lack of a dedicated intercept + * on SVM, KVM must assume that XSAVES (and thus XRSTORS) is usable by + * the guest if the host supports XSAVES and *XSAVE* is exposed to the + * guest. Because the guest can execute XSAVES and XRSTORS, i.e. can + * indirectly consume XSS, KVM must ensure XSS is zeroed when running + * the guest, i.e. must set XSAVES in vCPU capabilities. But to reject + * direct XSS reads and writes (to minimize the virtualization hole and + * honor userspace's CPUID), KVM needs to check the raw guest CPUID, + * not KVM's view of guest capabilities. + * + * For all other features, guest capabilities are accurate. Expand + * this allowlist with extreme vigilance. 
+ */ + BUILD_BUG_ON(x86_feature != X86_FEATURE_XSAVES); entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; - return __cpuid_entry_get_reg(entry, cpuid.reg); -} - -static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, - unsigned int x86_feature) -{ - u32 *reg; - - reg = guest_cpuid_get_register(vcpu, x86_feature); + reg = __cpuid_entry_get_reg(entry, cpuid.reg); if (!reg) return false; return *reg & __feature_bit(x86_feature); } -static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, - unsigned int x86_feature) -{ - u32 *reg; - - reg = guest_cpuid_get_register(vcpu, x86_feature); - if (reg) - *reg &= ~__feature_bit(x86_feature); -} - static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) { return vcpu->arch.is_amd_compatible; @@ -150,21 +173,6 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } -static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) -{ - return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); -} - -static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) -{ - return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) || - guest_cpuid_has(vcpu, X86_FEATURE_SBPB)); -} - static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; @@ -180,7 +188,6 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } @@ -188,7 +195,6 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } @@ -196,7 +202,6 @@ static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); } @@ -220,58 +225,69 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } -enum kvm_governed_features { -#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, -#include "governed_features.h" - KVM_NR_GOVERNED_FEATURES -}; - -static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) +static __always_inline void guest_cpu_cap_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - switch (x86_feature) { -#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; -#include "governed_features.h" - default: - return -1; - } -} + unsigned int x86_leaf = __feature_leaf(x86_feature); -static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) -{ - return kvm_governed_feature_index(x86_feature) >= 0; + vcpu->arch.cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } -static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline void guest_cpu_cap_clear(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + unsigned int x86_leaf = __feature_leaf(x86_feature); - __set_bit(kvm_governed_feature_index(x86_feature), - 
vcpu->arch.governed_features.enabled); + vcpu->arch.cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } -static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline void guest_cpu_cap_change(struct kvm_vcpu *vcpu, + unsigned int x86_feature, + bool guest_has_cap) { - if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) - kvm_governed_feature_set(vcpu, x86_feature); + if (guest_has_cap) + guest_cpu_cap_set(vcpu, x86_feature); + else + guest_cpu_cap_clear(vcpu, x86_feature); } -static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + unsigned int x86_leaf = __feature_leaf(x86_feature); - return test_bit(kvm_governed_feature_index(x86_feature), - vcpu->arch.governed_features.enabled); + /* + * Except for MWAIT, querying dynamic feature bits is disallowed, so + * that KVM can defer runtime updates until the next CPUID emulation. + */ + BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC || + x86_feature == X86_FEATURE_OSXSAVE || + x86_feature == X86_FEATURE_OSPKE); + + return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature); } static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (guest_can_use(vcpu, X86_FEATURE_LAM)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LAM)) cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); return kvm_vcpu_is_legal_gpa(vcpu, cr3); } +static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_STIBP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_SSBD)); +} + +static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 60986f67c35a..1349e278cd2a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -477,8 +477,11 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, + .src_type = ctxt->src.type, + .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, - .next_rip = ctxt->eip, + .rip = ctxt->eip, + .next_rip = ctxt->_eip, }; return ctxt->ops->intercept(ctxt, &info, stage); diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h deleted file mode 100644 index ad463b1ed4e4..000000000000 --- a/arch/x86/kvm/governed_features.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE) -BUILD_BUG() -#endif - -#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) - -KVM_GOVERNED_X86_FEATURE(GBPAGES) -KVM_GOVERNED_X86_FEATURE(XSAVES) -KVM_GOVERNED_X86_FEATURE(VMX) -KVM_GOVERNED_X86_FEATURE(NRIPS) -KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) -KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) -KVM_GOVERNED_X86_FEATURE(LBRV) -KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) -KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) -KVM_GOVERNED_X86_FEATURE(VGIF) -KVM_GOVERNED_X86_FEATURE(VNMI) -KVM_GOVERNED_X86_FEATURE(LAM) - -#undef KVM_GOVERNED_X86_FEATURE -#undef 
KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4f0a94346d00..24f0318c50d7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -952,8 +952,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } @@ -1352,7 +1351,7 @@ static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) return; if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVEC)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC)) return; pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " @@ -2226,6 +2225,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2854,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index cd57a517d04a..739aa6c0d0c3 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker)) goto fail_kthread; @@ -690,8 +690,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->kvm = kvm; pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; + hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8dec646e764b..a8fb19940975 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -567,7 +567,7 @@ static void pic_irq_request(struct kvm *kvm, int level) { struct kvm_pic *s = kvm->arch.vpic; - if (!s->output) + if (!s->output && level) s->wakeup_needed = true; s->output = level; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 995eb5054360..45dae2d5d2f1 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - e->fields.dest_id, dm) || - kvm_apic_pending_eoi(vcpu, e->fields.vector)) - __set_bit(e->fields.vector, - ioapic_handled_vectors); + kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm, + e->fields.vector, ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); diff 
--git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 539333ac4b38..aa8cb4ac0479 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,4 +120,6 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 63f66c51975a..97d68d837929 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -100,6 +100,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; + if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected) + return kvm_x86_call(protected_apic_has_interrupt)(v); + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8136695f7b96..d6d792b5d1bd 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -402,6 +402,33 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors) +{ + /* + * Intercept EOI if the vCPU is the target of the new IRQ routing, or + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU + * may receive a level-triggered IRQ in the future, or already received + * level-triggered IRQ. The EOI needs to be intercepted and forwarded + * to I/O APIC emulation so that the IRQ can be de-asserted. + */ + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { + __set_bit(vector, ioapic_handled_vectors); + } else if (kvm_apic_pending_eoi(vcpu, vector)) { + __set_bit(vector, ioapic_handled_vectors); + + /* + * Track the highest pending EOI for which the vCPU is NOT the + * target in the new routing. Only the EOI for the IRQ that is + * in-flight (for the old routing) needs to be intercepted, any + * future IRQs that arrive on this vCPU will be coincidental to + * the level-triggered routing and don't need to be intercepted. 
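A minimal standalone sketch of the interception decision described in the comment above, with invented names (struct vcpu_state, intercept_eoi) purely for illustration: intercept the EOI if the vCPU is a destination under the new routing, or if the IRQ is still pending from the old routing, in which case the highest such stale vector is remembered so its eventual EOI can trigger a re-scan.

#include <stdbool.h>

/* Illustrative stand-in for the per-vCPU field added in this hunk. */
struct vcpu_state {
        int highest_stale_pending_eoi;  /* -1 when nothing is stale */
};

static bool intercept_eoi(struct vcpu_state *v, int vector,
                          bool is_new_dest, bool pending_from_old_routing)
{
        if (is_new_dest)
                return true;

        if (pending_from_old_routing) {
                /*
                 * Remember the highest stale vector; only the in-flight IRQ
                 * from the old routing needs its EOI intercepted.
                 */
                if (vector > v->highest_stale_pending_eoi)
                        v->highest_stale_pending_eoi = vector;
                return true;
        }

        return false;
}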
+ */ + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; + } +} + void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { @@ -424,11 +451,11 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.trig_mode && - (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode) || - kvm_apic_pending_eoi(vcpu, irq.vector))) - __set_bit(irq.vector, ioapic_handled_vectors); + if (!irq.trig_mode) + continue; + + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, + irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 10495fffb890..c1df5acfacaf 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -44,7 +44,10 @@ struct x86_instruction_info { u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ + u8 src_type; /* type of source operand */ + u8 dst_type; /* type of destination operand */ u8 ad_bytes; /* size of src/dst address */ + u64 rip; /* rip of the instruction */ u64 next_rip; /* rip following the instruction */ }; @@ -88,6 +91,8 @@ struct x86_instruction_info { #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ +/* Emulation during event vectoring is unhandleable. */ +#define X86EMUL_UNHANDLEABLE_VECTORING 7 /* x86-specific emulation flags */ #define X86EMUL_F_WRITE BIT(0) @@ -270,8 +275,10 @@ struct operand { }; }; +#define X86_MAX_INSTRUCTION_LENGTH 15 + struct fetch_cache { - u8 data[15]; + u8 data[X86_MAX_INSTRUCTION_LENGTH]; u8 *ptr; u8 *end; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3c83951c619e..73418dc0ebb2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -221,13 +221,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } -static void kvm_apic_map_free(struct rcu_head *rcu) -{ - struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - - kvfree(map); -} - static int kvm_recalculate_phys_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu, bool *xapic_id_mismatch) @@ -489,7 +482,7 @@ out: mutex_unlock(&kvm->arch.apic_map_lock); if (old) - call_rcu(&old->rcu, kvm_apic_map_free); + kvfree_rcu(old, rcu); kvm_make_scan_ioapic_request(kvm); } @@ -598,7 +591,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) * version first and level-triggered interrupts never get EOIed in * IOAPIC. 
*/ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); @@ -662,27 +655,29 @@ static u8 count_vectors(void *bitmap) return count; } -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { + unsigned long pir_vals[NR_PIR_WORDS]; + u32 *__pir = (void *)pir_vals; u32 i, vec; - u32 pir_val, irr_val, prev_irr_val; + u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; + if (!pi_harvest_pir(pir, pir_vals)) + return false; + for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); - irr_val = *p_irr; - pir_val = READ_ONCE(pir[i]); - - if (pir_val) { - pir_val = xchg(&pir[i], 0); + irr_val = READ_ONCE(*p_irr); + if (__pir[i]) { prev_irr_val = irr_val; do { - irr_val = prev_irr_val | pir_val; + irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); @@ -698,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); @@ -734,10 +729,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { if (unlikely(apic->apicv_active)) { - /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_x86_call(hwapic_irr_update)(apic->vcpu, - apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -763,7 +755,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(apic->apicv_active)) - kvm_x86_call(hwapic_isr_update)(vec); + kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -808,7 +800,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(apic->apicv_active)) - kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic)); + kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -816,6 +808,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) + return; + + kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { /* This may race with setting of irr in __apic_accept_irq() and @@ -1458,6 +1461,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) if (!kvm_ioapic_handles_vector(apic, vector)) return; + /* + * If the intercepted EOI is for an IRQ that was pending from previous + * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely + * no longer need to be intercepted. 
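On the IRR side of this file, __kvm_apic_update_irr() in the hunk above folds harvested posted-interrupt bits into the vIRR with a compare-exchange retry loop. A generic, self-contained sketch of that merge pattern using C11 atomics (merge_pir_word is an invented name, not a KVM function):

#include <stdatomic.h>
#include <stdint.h>

/*
 * OR newly posted bits into one 32-bit IRR word without a lock: retry until
 * the compare-exchange succeeds against an unmodified snapshot, and skip the
 * store entirely if nothing would change.
 */
static void merge_pir_word(_Atomic uint32_t *irr_word, uint32_t pir_word)
{
        uint32_t old, merged;

        if (!pir_word)
                return;

        old = atomic_load(irr_word);
        do {
                merged = old | pir_word;
        } while (old != merged &&
                 !atomic_compare_exchange_weak(irr_word, &old, merged));
}

On failure the compare-exchange refreshes the snapshot, so bits that arrive concurrently are never lost.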
+ */ + if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector) + kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu); + /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { apic->vcpu->arch.pending_ioapic_eoi = vector; @@ -1789,8 +1800,17 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); + u32 reg; + /* + * Assume a timer IRQ was "injected" if the APIC is protected. KVM's + * copy of the vIRR is bogus, it's the responsibility of the caller to + * precisely check whether or not a timer IRQ is pending. + */ + if (apic->guest_apic_protected) + return true; + + reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; @@ -2357,7 +2377,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); + val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask); kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); break; @@ -2585,7 +2605,7 @@ static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; if (!apic) return; @@ -2634,7 +2654,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) return 0; u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | - (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); + (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 
0 : X2APIC_ENABLE); if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) return 1; @@ -2649,6 +2669,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) kvm_recalculate_apic_map(vcpu->kvm); return 0; } +EXPORT_SYMBOL_GPL(kvm_apic_set_base); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { @@ -2805,8 +2826,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_update_ppr(apic); if (apic->apicv_active) { kvm_x86_call(apicv_post_state_restore)(vcpu); - kvm_x86_call(hwapic_irr_update)(vcpu, -1); - kvm_x86_call(hwapic_isr_update)(-1); + kvm_x86_call(hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2914,9 +2934,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - apic->lapic_timer.timer.function = apic_timer_fn; + hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; @@ -2959,6 +2978,9 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) if (!kvm_apic_present(vcpu)) return -1; + if (apic->guest_apic_protected) + return -1; + __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } @@ -3121,9 +3143,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); if (apic->apicv_active) { kvm_x86_call(apicv_post_state_restore)(vcpu); - kvm_x86_call(hwapic_irr_update)(vcpu, - apic_find_highest_irr(apic)); - kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic)); + kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -3392,9 +3412,9 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -3403,7 +3423,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) sipi_vector = apic->sipi_vector; kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } } return 0; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 24add38beaf0..4ce30db65828 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -65,6 +65,8 @@ struct kvm_lapic { bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; + /* Select registers in the vAPIC cannot be read/written. */ + bool guest_apic_protected; /* Number of bits set in ISR. */ s16 isr_count; /* The highest vector set in ISR; if -1 - invalid, must scan ISR. 
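The "-1 means invalid, must scan ISR" convention above comes down to scanning 256 vector bits held in eight 32-bit words. A simplified sketch of that scan (find_highest_vector is an invented helper; the real APIC registers sit at 0x10-byte strides in the APIC page):

#include <stdint.h>

/* Returns the highest set vector (word * 32 + bit), or -1 if none is set. */
static int find_highest_vector(const uint32_t regs[8])
{
        for (int word = 7; word >= 0; word--) {
                uint32_t val = regs[word];

                if (!val)
                        continue;
                /* 31 - clz(val) is the index of the highest set bit. */
                return word * 32 + (31 - __builtin_clz(val));
        }
        return -1;
}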
*/ @@ -101,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); @@ -118,6 +120,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9322358678b..b4b6860ab971 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void) u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); @@ -104,6 +105,18 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + + /* + * Checking root.hpa is sufficient even when KVM has mirror root. 
+ * We can have either: + * (1) mirror_root_hpa = INVALID_PAGE, root.hpa = INVALID_PAGE + * (2) mirror_root_hpa = root, root.hpa = INVALID_PAGE + * (3) mirror_root_hpa = root1, root.hpa = root2 + * We don't ever have: + * mirror_root_hpa = INVALID_PAGE, root.hpa = root + */ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE)) return 0; @@ -126,7 +139,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu) { - if (!guest_can_use(vcpu, X86_FEATURE_LAM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LAM)) return 0; return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57); @@ -222,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool kvm_mmu_may_ignore_guest_pat(void); +bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); @@ -244,6 +257,9 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); + static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); @@ -287,4 +303,26 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, return gpa; return translate_nested_gpa(vcpu, gpa, access, exception); } + +static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static inline gfn_t kvm_gfn_direct_bits(const struct kvm *kvm) +{ + return kvm->arch.gfn_direct_bits; +} + +static inline bool kvm_is_addr_direct(struct kvm *kvm, gpa_t gpa) +{ + gpa_t gpa_direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(kvm)); + + return !gpa_direct_bits || (gpa & gpa_direct_bits); +} + +static inline bool kvm_is_gfn_alias(struct kvm *kvm, gfn_t gfn) +{ + return gfn & kvm_gfn_direct_bits(kvm); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2401606db260..4e06e2e89a8f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -501,7 +502,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return false; } - if (!spte_has_volatile_bits(old_spte)) + if (!spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); @@ -524,7 +525,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || - !spte_has_volatile_bits(old_spte)) + !spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); @@ -599,6 +600,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; + if (kvm_has_mirrored_tdp(vcpu->kvm)) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); if (r) 
@@ -618,6 +625,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } @@ -846,32 +854,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings. */ #define KVM_RMAP_MANY BIT(0) /* + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always + * operates with mmu_lock held for write), but rmaps can be walked without + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain + * being zapped/dropped _while the rmap is locked_. + * + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be + * done while holding mmu_lock for write. This allows a task walking rmaps + * without holding mmu_lock to concurrently walk the same entries as a task + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify + * the rmaps, thus the walks are stable. + * + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, + * only the rmap chains themselves are protected. E.g. holding an rmap's lock + * ensures all "struct pte_list_desc" fields are stable. + */ +#define KVM_RMAP_LOCKED BIT(1) + +static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head) +{ + unsigned long old_val, new_val; + + lockdep_assert_preemption_disabled(); + + /* + * Elide the lock if the rmap is empty, as lockless walkers (read-only + * mode) don't need to (and can't) walk an empty rmap, nor can they add + * entries to the rmap. I.e. the only paths that process empty rmaps + * do so while holding mmu_lock for write, and are mutually exclusive. + */ + old_val = atomic_long_read(&rmap_head->val); + if (!old_val) + return 0; + + do { + /* + * If the rmap is locked, wait for it to be unlocked before + * trying acquire the lock, e.g. to avoid bouncing the cache + * line. + */ + while (old_val & KVM_RMAP_LOCKED) { + cpu_relax(); + old_val = atomic_long_read(&rmap_head->val); + } + + /* + * Recheck for an empty rmap, it may have been purged by the + * task that held the lock. + */ + if (!old_val) + return 0; + + new_val = old_val | KVM_RMAP_LOCKED; + /* + * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap + * from being reordered outside of the critical section created by + * __kvm_rmap_lock(). + * + * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). + * + * For the !old_val case, no ordering is needed, as there is no rmap + * to walk. + */ + } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val)); + + /* + * Return the old value, i.e. _without_ the LOCKED bit set. It's + * impossible for the return value to be 0 (see above), i.e. the read- + * only unlock flow can't get a false positive and fail to unlock. 
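A compact sketch of the locking scheme introduced above, i.e. a lock bit packed into the same word that stores the rmap value, written with C11 atomics and invented names (packed_head, packed_lock). It deliberately omits the empty-rmap fast path and the preemption handling that the real __kvm_rmap_lock() has:

#include <stdatomic.h>

#define LOCKED_BIT      (1UL << 1)      /* mirrors KVM_RMAP_LOCKED */

struct packed_head {
        _Atomic unsigned long val;      /* value/pointer lives in the other bits */
};

/* Acquire the lock and return the stored value with the lock bit stripped. */
static unsigned long packed_lock(struct packed_head *h)
{
        unsigned long old = atomic_load(&h->val);

        do {
                while (old & LOCKED_BIT) {
                        /* Spin until the current holder clears the bit. */
                        old = atomic_load(&h->val);
                }
        } while (!atomic_compare_exchange_weak_explicit(&h->val, &old,
                                                        old | LOCKED_BIT,
                                                        memory_order_acquire,
                                                        memory_order_relaxed));
        return old;
}

/* Release the lock by publishing the (possibly updated) value. */
static void packed_unlock(struct packed_head *h, unsigned long new_val)
{
        atomic_store_explicit(&h->val, new_val & ~LOCKED_BIT,
                              memory_order_release);
}

The acquire/release pairing is what keeps the rmap walk inside the critical section, matching the cmpxchg_acquire/set_release comments above.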
+ */ + return old_val; +} + +static unsigned long kvm_rmap_lock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + return __kvm_rmap_lock(rmap_head); +} + +static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, + unsigned long val) +{ + KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); + /* + * Ensure that all accesses to the rmap have completed before unlocking + * the rmap. + * + * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock(). + */ + atomic_long_set_release(&rmap_head->val, val); +} + +static void kvm_rmap_unlock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + unsigned long new_val) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + __kvm_rmap_unlock(rmap_head, new_val); +} + +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) +{ + return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED; +} + +/* + * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The + * actual locking is the same, but the caller is disallowed from modifying the + * rmap, and so the unlock flow is a nop if the rmap is/was empty. + */ +static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head) +{ + unsigned long rmap_val; + + preempt_disable(); + rmap_val = __kvm_rmap_lock(rmap_head); + + if (!rmap_val) + preempt_enable(); + + return rmap_val; +} + +static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head, + unsigned long old_val) +{ + if (!old_val) + return; + + KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head)); + + __kvm_rmap_unlock(rmap_head, old_val); + preempt_enable(); +} + +/* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, - struct kvm_rmap_head *rmap_head) +static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + u64 *spte, struct kvm_rmap_head *rmap_head) { + unsigned long old_val, new_val; struct pte_list_desc *desc; int count = 0; - if (!rmap_head->val) { - rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & KVM_RMAP_MANY)) { + old_val = kvm_rmap_lock(kvm, rmap_head); + + if (!old_val) { + new_val = (unsigned long)spte; + } else if (!(old_val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->sptes[0] = (u64 *)rmap_head->val; + desc->sptes[0] = (u64 *)old_val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -880,21 +1029,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; + } else { + new_val = old_val; } desc->sptes[desc->spte_count++] = spte; } + + kvm_rmap_unlock(kvm, rmap_head, new_val); + return count; } -static void pte_list_desc_remove_entry(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val, 
struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -921,9 +1074,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) - rmap_head->val = 0; + *rmap_val = 0; else - rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; + *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -931,24 +1084,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; + unsigned long rmap_val; int i; - if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) - return; + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) + goto out; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) - return; + if (!(rmap_val & KVM_RMAP_MANY)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm)) + goto out; - rmap_head->val = 0; + rmap_val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(kvm, rmap_head, + pte_list_desc_remove_entry(kvm, &rmap_val, desc, i); - return; + goto out; } } desc = desc->more; @@ -956,6 +1111,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } + +out: + kvm_rmap_unlock(kvm, rmap_head, rmap_val); } static void kvm_zap_one_rmap_spte(struct kvm *kvm, @@ -970,17 +1128,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; + unsigned long rmap_val; int i; - if (!rmap_head->val) + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (!rmap_val) return false; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + if (!(rmap_val & KVM_RMAP_MANY)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -990,20 +1150,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, } out: /* rmap_head is meaningless now, remember to reset it */ - rmap_head->val = 0; + kvm_rmap_unlock(kvm, rmap_head, 0); return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { + unsigned long rmap_val = kvm_rmap_get(rmap_head); struct pte_list_desc *desc; - if (!rmap_head->val) + if (!rmap_val) return 0; - else if (!(rmap_head->val & KVM_RMAP_MANY)) + else if (!(rmap_val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1046,6 +1207,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) */ struct rmap_iterator { /* private fields */ + struct rmap_head *head; struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; @@ -1060,23 +1222,19 @@ struct rmap_iterator { static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct 
rmap_iterator *iter) { - u64 *sptep; + unsigned long rmap_val = kvm_rmap_get(rmap_head); - if (!rmap_head->val) + if (!rmap_val) return NULL; - if (!(rmap_head->val & KVM_RMAP_MANY)) { + if (!(rmap_val & KVM_RMAP_MANY)) { iter->desc = NULL; - sptep = (u64 *)rmap_head->val; - goto out; + return (u64 *)rmap_val; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); iter->pos = 0; - sptep = iter->desc->sptes[iter->pos]; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; + return iter->desc->sptes[iter->pos]; } /* @@ -1086,14 +1244,11 @@ out: */ static u64 *rmap_get_next(struct rmap_iterator *iter) { - u64 *sptep; - if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; - sptep = iter->desc->sptes[iter->pos]; - if (sptep) - goto out; + if (iter->desc->sptes[iter->pos]) + return iter->desc->sptes[iter->pos]; } iter->desc = iter->desc->more; @@ -1101,20 +1256,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - sptep = iter->desc->sptes[iter->pos]; - goto out; + return iter->desc->sptes[iter->pos]; } } return NULL; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; } -#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ - _spte_; _spte_ = rmap_get_next(_iter_)) +#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \ + _sptep_; _sptep_ = rmap_get_next(_iter_)) + +#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \ + +#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep))) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1200,12 +1359,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (spte_ad_need_write_protect(*sptep)) flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); + } return flush; } @@ -1297,15 +1457,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * enabled but it chooses between clearing the Dirty bit and Writeable * bit based on the context. 
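A toy restatement of the choice described in that comment, now keyed off the per-VM kvm->arch.cpu_dirty_log_size instead of a global kvm_x86_ops field: with a hardware dirty-log mechanism (e.g. PML) the Dirty bit is cleared, otherwise the page is write-protected so the next write faults and is logged in software. The PTE bit layout and function name below are invented for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PTE_WRITABLE    (1ULL << 1)     /* invented layout */
#define PTE_DIRTY       (1ULL << 6)

static void enable_dirty_logging(uint64_t *ptes, size_t base, uint64_t mask,
                                 bool cpu_dirty_logging)
{
        while (mask) {
                size_t i = base + __builtin_ctzll(mask);

                if (cpu_dirty_logging)
                        ptes[i] &= ~PTE_DIRTY;      /* hardware re-logs on write */
                else
                        ptes[i] &= ~PTE_WRITABLE;   /* next write faults, gets logged */

                mask &= mask - 1;                   /* clear lowest set bit */
        }
}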
*/ - if (kvm_x86_ops.cpu_dirty_log_size) + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, @@ -1394,7 +1554,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); - if (iterator->rmap->val) + if (atomic_long_read(&iterator->rmap->val)) return; } @@ -1526,7 +1686,7 @@ static void __rmap_add(struct kvm *kvm, kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(cache, spte, rmap_head); + rmap_count = pte_list_add(kvm, cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; @@ -1545,51 +1705,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, } static bool kvm_rmap_age_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, bool test_only) + struct kvm_gfn_range *range, + bool test_only) { - struct slot_rmap_walk_iterator iterator; + struct kvm_rmap_head *rmap_head; struct rmap_iterator iter; + unsigned long rmap_val; bool young = false; u64 *sptep; + gfn_t gfn; + int level; + u64 spte; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) { - for_each_rmap_spte(iterator.rmap, &iter, sptep) { - u64 spte = *sptep; + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + for (gfn = range->start; gfn < range->end; + gfn += KVM_PAGES_PER_HPAGE(level)) { + rmap_head = gfn_to_rmap(gfn, level, range->slot); + rmap_val = kvm_rmap_lock_readonly(rmap_head); - if (!is_accessed_spte(spte)) - continue; + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { + if (!is_accessed_spte(spte)) + continue; - if (test_only) - return true; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * WARN if mmu_spte_update() signals the need - * for a TLB flush, as Access tracking a SPTE - * should never trigger an _immediate_ flush. - */ - spte = mark_spte_for_access_track(spte); - WARN_ON_ONCE(mmu_spte_update(sptep, spte)); + if (test_only) { + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + return true; + } + + if (spte_ad_enabled(spte)) + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + else + /* + * If the following cmpxchg fails, the + * spte is being concurrently modified + * and should most likely stay young. 
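The aging path above runs without mmu_lock, so an access-tracked SPTE is only rewritten if it has not changed underneath; if the cmpxchg loses the race, the entry is simply reported young. A generic sketch of that "update only if unchanged" pattern, with an invented bit layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define ENTRY_ACCESSED  (1ULL << 5)     /* invented bit, illustration only */

/* Clear the Accessed bit only if the entry is exactly what we read. */
static bool age_entry(_Atomic uint64_t *entry)
{
        uint64_t old = atomic_load(entry);

        if (!(old & ENTRY_ACCESSED))
                return false;           /* already old */

        /* Losing this race is harmless; losing a concurrent update is not. */
        atomic_compare_exchange_strong(entry, &old, old & ~ENTRY_ACCESSED);
        return true;                    /* it was young either way */
}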
+ */ + cmpxchg64(sptep, spte, + mark_spte_for_access_track(spte)); + young = true; } - young = true; + + kvm_rmap_unlock_readonly(rmap_head, rmap_val); } } return young; } +static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm) +{ + return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, false); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_age_gfn_range(kvm, range); + young = kvm_tdp_mmu_age_gfn_range(kvm, range); + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, false); return young; } @@ -1598,11 +1774,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, true); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_test_age_gfn(kvm, range); + young = kvm_tdp_mmu_test_age_gfn(kvm, range); + + if (young) + return young; + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, true); return young; } @@ -1649,13 +1828,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, +static void mmu_page_add_parent_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(cache, parent_pte, &sp->parent_ptes); + pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -2345,7 +2525,7 @@ static void __link_shadow_page(struct kvm *kvm, mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(cache, sp, sptep); + mmu_page_add_parent_pte(kvm, cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For @@ -2409,7 +2589,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. 
*/ if (tdp_enabled && invalid_list && - child->role.guest_mode && !child->parent_ptes.val) + child->role.guest_mode && + !atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } @@ -2839,7 +3020,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { - if (prefetch) + if (prefetch && is_last_spte(*sptep, level) && + pfn == spte_to_pfn(*sptep)) return RET_PF_SPURIOUS; /* @@ -2853,7 +3035,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; - } else if (pfn != spte_to_pfn(*sptep)) { + } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) { drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -3656,8 +3838,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) unsigned i; int r; - if (tdp_mmu_enabled) - return kvm_tdp_mmu_alloc_root(vcpu); + if (tdp_mmu_enabled) { + if (kvm_has_mirrored_tdp(vcpu->kvm) && + !VALID_PAGE(mmu->mirror_root_hpa)) + kvm_tdp_mmu_alloc_root(vcpu, true); + kvm_tdp_mmu_alloc_root(vcpu, false); + return 0; + } write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); @@ -4379,8 +4566,12 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; + struct kvm *kvm = vcpu->kvm; int ret; + if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm)) + return -EFAULT; + /* * Note that the mmu_invalidate_seq also serves to detect a concurrent * change in attributes. is_page_fault_stale() will detect an @@ -4394,7 +4585,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, * Now that we have a snapshot of mmu_invalidate_seq we can check for a * private vs. shared mismatch. */ - if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { + if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } @@ -4456,7 +4647,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ - if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_mmu_faultin_pfn(vcpu, fault); @@ -4476,7 +4667,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, * overall cost of failing to detect the invalidation until after * mmu_lock is acquired. */ - if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { + if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) { kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY; } @@ -4646,19 +4837,6 @@ out_unlock: } #endif -bool kvm_mmu_may_ignore_guest_pat(void) -{ - /* - * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM - * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to - * honor the memtype from the guest's PAT so that guest accesses to - * memory that is DMA'd aren't cached against the guest's wishes. As a - * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, - * KVM _always_ ignores guest PAT (when EPT is enabled). 
- */ - return shadow_memtype_mask; -} - int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 @@ -4669,8 +4847,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, - u8 *level) +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; @@ -4684,6 +4861,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, do { if (signal_pending(current)) return -EINTR; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); @@ -4708,18 +4889,23 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, return -EIO; } } +EXPORT_SYMBOL_GPL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; + u64 direct_bits; u64 end; int r; if (!vcpu->kvm->arch.pre_fault_allowed) return -EOPNOTSUPP; + if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa))) + return -EINVAL; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. @@ -4728,15 +4914,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, if (r) return r; + direct_bits = 0; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; + else + direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ - r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r; @@ -5022,7 +5211,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use(vcpu, X86_FEATURE_GBPAGES), + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_compatible(vcpu)); } @@ -5099,7 +5288,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use(vcpu, X86_FEATURE_GBPAGES), + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_pse, is_amd); if (!shadow_me_mask) @@ -5400,12 +5589,19 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + int maxpa; + + if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM) + maxpa = cpuid_query_maxguestphyaddr(vcpu); + else + maxpa = cpuid_maxphyaddr(vcpu); + /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ - if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && maxpa <= 48) return 4; return max_tdp_level; @@ -5524,7 +5720,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. 
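The level selection in kvm_mmu_get_tdp_level() above reduces to a small decision that may be easier to read in isolation. This is only a restatement with invented parameter names, with the TDX-specific guest-maxphyaddr query folded into max_guest_pa_bits:

/* Restatement of the TDP level choice; not KVM code. */
static int pick_tdp_level(int max_guest_pa_bits, int max_hw_tdp_level,
                          int forced_level)
{
        if (forced_level)
                return forced_level;

        /* 5-level paging only pays off for guests addressing above 48 bits. */
        if (max_hw_tdp_level == 5 && max_guest_pa_bits <= 48)
                return 4;

        return max_hw_tdp_level;
}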
*/ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); @@ -5724,6 +5920,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -5785,6 +5982,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -6095,8 +6293,16 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err else if (r == RET_PF_SPURIOUS) vcpu->stat.pf_spurious++; + /* + * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or + * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE. + * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to + * indicate continuing the page fault handling until to the final + * page table mapping phase. + */ + WARN_ON_ONCE(r == RET_PF_CONTINUE); if (r != RET_PF_EMULATE) - return 1; + return r; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, @@ -6272,6 +6478,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; + mmu->mirror_root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -6441,8 +6648,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (tdp_mmu_enabled) - kvm_tdp_mmu_invalidate_all_roots(kvm); + if (tdp_mmu_enabled) { + /* + * External page tables don't support fast zapping, therefore + * their mirrors must be invalidated separately by the caller. + */ + kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS); + } /* * Notify all vcpus to reload its shadow page table and flush TLB. @@ -6467,7 +6679,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * lead to use-after-free. */ if (tdp_mmu_enabled) - kvm_tdp_mmu_zap_invalidated_roots(kvm); + kvm_tdp_mmu_zap_invalidated_roots(kvm, true); } void kvm_mmu_init_vm(struct kvm *kvm) @@ -7035,6 +7247,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, }; bool flush; @@ -7090,6 +7303,19 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_wake_nx_recovery_thread(struct kvm *kvm) +{ + /* + * The NX recovery thread is spawned on-demand at the first KVM_RUN and + * may not be valid even though the VM is globally visible. Do nothing, + * as such a VM can't have any possible NX huge pages. 
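The READ_ONCE() in the waker above pairs with the on-demand creation added further down in this file (call_once() plus a WRITE_ONCE() publish once the vhost task is fully started). A userspace sketch of the same "create once, publish when ready, waker tolerates absence" pattern, using pthread_once and C11 atomics as stand-ins (all names invented):

#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>

static pthread_once_t worker_once = PTHREAD_ONCE_INIT;
static pthread_t worker_tid;
static _Atomic(pthread_t *) worker_handle;      /* NULL until fully started */
static _Atomic int kick;

static void *worker_fn(void *arg)
{
        (void)arg;
        for (;;) {
                if (atomic_exchange(&kick, 0)) {
                        /* one round of recovery work would go here */
                }
                usleep(1000);
        }
        return NULL;
}

static void start_worker(void)
{
        if (pthread_create(&worker_tid, NULL, worker_fn, NULL))
                return;
        /* Publish the handle only once the worker is fully set up. */
        atomic_store_explicit(&worker_handle, &worker_tid, memory_order_release);
}

static void ensure_worker(void)         /* first-run path */
{
        pthread_once(&worker_once, start_worker);
}

static void wake_worker(void)           /* no-op if the worker never started */
{
        if (atomic_load_explicit(&worker_handle, memory_order_acquire))
                atomic_store(&kick, 1);
}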
+ */ + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); + + if (nx_thread) + vhost_task_wake(nx_thread); +} + static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) @@ -7150,7 +7376,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } @@ -7220,6 +7446,12 @@ out: void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); + if (tdp_mmu_enabled) { + read_lock(&vcpu->kvm->mmu_lock); + mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa, + NULL); + read_unlock(&vcpu->kvm->mmu_lock); + } free_mmu_pages(&vcpu->arch.root_mmu); free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); @@ -7279,7 +7511,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } @@ -7411,23 +7643,35 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -int kvm_mmu_post_init_vm(struct kvm *kvm) +static int kvm_mmu_start_lpage_recovery(struct once *once) { - if (nx_hugepage_mitigation_hard_disabled) - return 0; + struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct vhost_task *nx_thread; kvm->arch.nx_huge_page_last = get_jiffies_64(); - kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( - kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, - kvm, "kvm-nx-lpage-recovery"); + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, + kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; + if (IS_ERR(nx_thread)) + return PTR_ERR(nx_thread); - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_start(nx_thread); + + /* Make the task visible only once it is fully started. */ + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); return 0; } +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + if (nx_hugepage_mitigation_hard_disabled) + return 0; + + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); +} + void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) @@ -7435,9 +7679,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. 
KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7452,27 +7717,49 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; - return kvm_unmap_gfn_range(kvm, range); -} + if (WARN_ON_ONCE(range->end <= range->start)) + return false; -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + + /* Unmap the old attribute page. */ + if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) + range->attr_filter = KVM_FILTER_SHARED; + else + range->attr_filter = KVM_FILTER_PRIVATE; + + return kvm_unmap_gfn_range(kvm, range); } + + static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b00abbe3f6cf..db8f33e4de62 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -6,6 +6,8 @@ #include <linux/kvm_host.h> #include <asm/kvm_host.h> +#include "mmu.h" + #ifdef CONFIG_KVM_PROVE_MMU #define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x) #else @@ -101,7 +103,22 @@ struct kvm_mmu_page { int root_count; refcount_t tdp_mmu_root_count; }; - unsigned int unsync_children; + union { + /* These two members aren't used for TDP MMU */ + struct { + unsigned int unsync_children; + /* + * Number of writes since the last time traversal + * visited this page. + */ + atomic_t write_flooding_count; + }; + /* + * Page table page of external PT. + * Passed to TDX module, not accessed by KVM. + */ + void *external_spt; + }; union { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ tdp_ptep_t ptep; @@ -124,9 +141,6 @@ struct kvm_mmu_page { int clear_spte_count; #endif - /* Number of writes since the last time traversal visited this page. 
*/ - atomic_t write_flooding_count; - #ifdef CONFIG_X86_64 /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; @@ -145,7 +159,36 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) return kvm_mmu_role_as_id(sp->role); } -static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) +static inline bool is_mirror_sp(const struct kvm_mmu_page *sp) +{ + return sp->role.is_mirror; +} + +static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + /* + * external_spt is allocated for TDX module to hold private EPT mappings, + * TDX module will initialize the page by itself. + * Therefore, KVM does not need to initialize or access external_spt. + * KVM only interacts with sp->spt for private EPT operations. + */ + sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache); +} + +static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root) +{ + /* + * Since mirror SPs are used only for TDX, which maps private memory + * at its "natural" GFN, no mask needs to be applied to them - and, dually, + * we expect that the bits is only used for the shared PT. + */ + if (is_mirror_sp(root)) + return 0; + return kvm_gfn_direct_bits(kvm); +} + +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, + struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -155,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). */ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) @@ -229,7 +272,12 @@ struct kvm_page_fault { */ u8 goal_level; - /* Shifted addr, or result of guest page table walk if addr is a gva. */ + /* + * Shifted addr, or result of guest page table walk if addr is a gva. In + * the case of VM where memslot's can be mapped at multiple GPA aliases + * (i.e. TDX), the gfn field does not contain the bit that selects between + * the aliases (i.e. the shared bit for TDX). + */ gfn_t gfn; /* The memslot containing gfn. May be NULL. */ @@ -268,9 +316,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h * * Note, all values must be greater than or equal to zero so as not to encroach - * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which - * will allow for efficient machine code when checking for CONTINUE, e.g. - * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero. + * on -errno return values. */ enum { RET_PF_CONTINUE = 0, @@ -282,6 +328,14 @@ enum { RET_PF_SPURIOUS, }; +/* + * Define RET_PF_CONTINUE as 0 to allow for + * - efficient machine code when checking for CONTINUE, e.g. + * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero, + * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value. 
+ */ +static_assert(RET_PF_CONTINUE == 0); + static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -317,10 +371,19 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; if (vcpu->arch.mmu->root_role.direct) { - fault.gfn = fault.addr >> PAGE_SHIFT; + /* + * Things like memslots don't understand the concept of a shared + * bit. Strip it so that the GFN can be used like normal, and the + * fault.addr can be used when the shared bit is needed. + */ + fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm); fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); } + /* + * With retpoline being active an indirect call is rather expensive, + * so do a direct call in the most common case. + */ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 561c331fd6ec..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm) struct kvm_memory_slot *slot; int r = 0, i, bkt; + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + mutex_lock(&kvm->slots_arch_lock); /* diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f4711674c47b..68e323568e95 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -510,8 +510,7 @@ error: * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ - walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << - EPT_VIOLATION_RWX_SHIFT; + walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); } #endif walker->fault.address = addr; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 22551e2f1d00..cfce03d8f123 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; -u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -96,8 +95,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); - access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; @@ -129,25 +126,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } /* - * Returns true if the SPTE has bits that may be set without holding mmu_lock. - * The caller is responsible for checking if the SPTE is shadow-present, and - * for determining whether or not the caller cares about non-leaf SPTEs. + * Returns true if the SPTE needs to be updated atomically due to having bits + * that may be changed without holding mmu_lock, and for which KVM must not + * lose information. E.g. KVM must not drop Dirty bit information. The caller + * is responsible for checking if the SPTE is shadow-present, and for + * determining whether or not the caller cares about non-leaf SPTEs. */ -bool spte_has_volatile_bits(u64 spte) +bool spte_needs_atomic_update(u64 spte) { + /* SPTEs can be made Writable bit by KVM's fast page fault handler. 
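
A small sketch of the shared-bit handling described in the kvm_mmu_do_page_fault() hunk above: memslot lookups use the stripped GFN while the paging structures are walked with the bit restored. The bit position is invented for illustration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define DIRECT_GFN_BIT	(1ULL << 35)	/* illustrative "shared" bit, not a real constant */

int main(void)
{
	uint64_t fault_addr = (DIRECT_GFN_BIT << PAGE_SHIFT) | 0x7654321000ULL;

	uint64_t gfn = (fault_addr >> PAGE_SHIFT) & ~DIRECT_GFN_BIT;	/* for memslots */
	uint64_t mapping_gfn = gfn | DIRECT_GFN_BIT;			/* for the page walk */

	printf("gfn=%#llx mapping_gfn=%#llx\n",
	       (unsigned long long)gfn, (unsigned long long)mapping_gfn);
	return 0;
}
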
*/ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; - if (is_access_track_spte(spte)) + /* + * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked + * SPTEs can be restored by KVM's fast page fault handler. + */ + if (!spte_ad_enabled(spte)) return true; - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; + /* + * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed + * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't + * invalidate TLBs when aging SPTEs, and so it's safe to clobber the + * Accessed bit (and rare in practice). + */ + return is_writable_pte(spte) && !(spte & shadow_dirty_mask); } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -170,7 +174,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; - else if (kvm_mmu_page_ad_need_write_protect(sp)) + else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; @@ -205,9 +209,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (shadow_memtype_mask) - spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -433,6 +435,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ @@ -456,13 +464,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ shadow_present_mask = (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; - /* - * EPT overrides the host MTRRs, and so KVM must program the desired - * memtype directly into the SPTEs. Note, this mask is just the mask - * of all bits that factor into the memtype, the actual memtype must be - * dynamically calculated, e.g. to ensure host MMIO is mapped UC. - */ - shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -514,12 +516,6 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; - /* - * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB - * memtype in the SPTEs, i.e. relies on host MTRRs to provide the - * correct memtype (WB is the "weakest" memtype). 
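
A user-space sketch of the decision made by spte_needs_atomic_update() above; the bit positions are invented for illustration and do not match any real SPTE layout.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative bit assignments only. */
#define SPTE_WRITABLE		(1ULL << 1)
#define SPTE_MMU_WRITABLE	(1ULL << 2)
#define SPTE_AD_ENABLED		(1ULL << 3)
#define SPTE_DIRTY		(1ULL << 5)

static bool needs_atomic_update(uint64_t spte)
{
	/* The Writable bit can be restored by a concurrent fast page fault. */
	if (!(spte & SPTE_WRITABLE) && (spte & SPTE_MMU_WRITABLE))
		return true;

	/* A/D-disabled SPTEs can be access-tracked and restored concurrently. */
	if (!(spte & SPTE_AD_ENABLED))
		return true;

	/* Only a hardware-set Dirty bit must not be clobbered. */
	return (spte & SPTE_WRITABLE) && !(spte & SPTE_DIRTY);
}

int main(void)
{
	/* Writable, A/D-enabled and already dirty: a plain write is fine, so exit 0. */
	return needs_atomic_update(SPTE_WRITABLE | SPTE_MMU_WRITABLE |
				   SPTE_AD_ENABLED | SPTE_DIRTY);
}
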
- */ - shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index af10bc0380a3..1e94f081bdaf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; -extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -276,6 +275,11 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root) return spte_to_child_sp(root); } +static inline bool is_mirror_sptep(tdp_ptep_t sptep) +{ + return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep))); +} + static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && @@ -514,7 +518,7 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -bool spte_has_volatile_bits(u64 spte); +bool spte_needs_atomic_update(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 04c247bfe318..9e17bfa80901 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -37,15 +37,17 @@ void tdp_iter_restart(struct tdp_iter *iter) * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, - int min_level, gfn_t next_last_level_gfn) + int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits) { if (WARN_ON_ONCE(!root || (root->role.level < 1) || - (root->role.level > PT64_ROOT_MAX_LEVEL))) { + (root->role.level > PT64_ROOT_MAX_LEVEL) || + (gfn_bits && next_last_level_gfn >= gfn_bits))) { iter->valid = false; return; } iter->next_last_level_gfn = next_last_level_gfn; + iter->gfn_bits = gfn_bits; iter->root_level = root->role.level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; @@ -113,7 +115,7 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. 
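
A sketch of how OR-ing the extra mapping bits into the GFN, as tdp_iter_refresh_sptep() and try_step_side() now do, changes the index selected at the upper levels of the walk; shift amounts assume standard 9-bit-per-level paging and the mapping bit is illustrative.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define LEVEL_BITS	9

/* Index of the entry that maps @gpa at @level (level 1 = 4 KiB leaves). */
static unsigned int spte_index(uint64_t gpa, int level)
{
	return (gpa >> (PAGE_SHIFT + (level - 1) * LEVEL_BITS)) & ((1u << LEVEL_BITS) - 1);
}

int main(void)
{
	uint64_t gfn = 0x7654321;
	uint64_t gfn_bits = 1ULL << 35;		/* illustrative mapping bit */

	for (int level = 4; level >= 1; level--)
		printf("level %d: index %u -> %u\n", level,
		       spte_index(gfn << PAGE_SHIFT, level),
		       spte_index((gfn | gfn_bits) << PAGE_SHIFT, level));
	return 0;
}
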
*/ - if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) == (SPTE_ENT_PER_PAGE - 1)) return false; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 2880fd392e0c..364c5da6c499 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) return xchg(rcu_dereference(sptep), new_spte); } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); @@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) } /* - * SPTEs must be modified atomically if they are shadow-present, leaf - * SPTEs, and have volatile bits, i.e. has bits that can be set outside - * of mmu_lock. The Writable bit can be set by KVM's fast page fault - * handler, and Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs, + * and have volatile bits (bits that can be set outside of mmu_lock) that + * must not be clobbered. */ -static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level) { return is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte); + spte_needs_atomic_update(old_spte); } static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); @@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte; @@ -93,8 +89,10 @@ struct tdp_iter { tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ tdp_ptep_t sptep; - /* The lowest GFN mapped by the current SPTE */ + /* The lowest GFN (mask bits excluded) mapped by the current SPTE */ gfn_t gfn; + /* Mask applied to convert the GFN to the mapping GPA */ + gfn_t gfn_bits; /* The level of the root page given to the iterator */ int root_level; /* The lowest level the iterator should traverse to */ @@ -122,18 +120,23 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder 
traversal. */ -#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \ - for (tdp_iter_start(&iter, root, min_level, start); \ - iter.valid && iter.gfn < end; \ +#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \ + for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \ + iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) -#define for_each_tdp_pte(iter, root, start, end) \ - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) +#define for_each_tdp_pte_min_level_all(iter, root, min_level) \ + for (tdp_iter_start(&iter, root, min_level, 0, 0); \ + iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, kvm, root, start, end) \ + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, - int min_level, gfn_t next_last_level_gfn); + int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 2f15e0e33903..7f3d7229b2c1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -37,10 +37,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) * for zapping and thus puts the TDP MMU's reference to each root, i.e. * ultimately frees all roots. */ - kvm_tdp_mmu_invalidate_all_roots(kvm); - kvm_tdp_mmu_zap_invalidated_roots(kvm); + kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); + kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -53,6 +55,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { + free_page((unsigned long)sp->external_spt); free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } @@ -91,19 +94,33 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } +static bool tdp_mmu_root_match(struct kvm_mmu_page *root, + enum kvm_tdp_mmu_root_types types) +{ + if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS))) + return false; + + if (root->role.invalid && !(types & KVM_INVALID_ROOTS)) + return false; + + if (likely(!is_mirror_sp(root))) + return types & KVM_DIRECT_ROOTS; + return types & KVM_MIRROR_ROOTS; +} + /* * Returns the next root after @prev_root (or the first root if @prev_root is - * NULL). A reference to the returned root is acquired, and the reference to - * @prev_root is released (the caller obviously must hold a reference to - * @prev_root if it's non-NULL). + * NULL) that matches with @types. A reference to the returned root is + * acquired, and the reference to @prev_root is released (the caller obviously + * must hold a reference to @prev_root if it's non-NULL). * - * If @only_valid is true, invalid roots are skipped. + * Roots that doesn't match with @types are skipped. * * Returns NULL if the end of tdp_mmu_roots was reached. 
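
A compact sketch of the filtering done by tdp_mmu_root_match() above; the flag values mirror the kvm_tdp_mmu_root_types enum added to tdp_mmu.h later in this patch, but the root structure here is simplified for illustration.

#include <stdbool.h>

enum root_types {
	INVALID_ROOTS	= 1 << 0,
	DIRECT_ROOTS	= 1 << 1,
	MIRROR_ROOTS	= 1 << 2,
	VALID_ROOTS	= DIRECT_ROOTS | MIRROR_ROOTS,
	ALL_ROOTS	= VALID_ROOTS | INVALID_ROOTS,
};

struct root {
	bool invalid;
	bool is_mirror;
};

static bool root_match(const struct root *root, enum root_types types)
{
	if (root->invalid && !(types & INVALID_ROOTS))
		return false;
	return types & (root->is_mirror ? MIRROR_ROOTS : DIRECT_ROOTS);
}

int main(void)
{
	struct root mirror = { .invalid = false, .is_mirror = true };

	/* Exit 0: a valid mirror root matches VALID_ROOTS. */
	return !root_match(&mirror, VALID_ROOTS);
}
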
*/ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool only_valid) + enum kvm_tdp_mmu_root_types types) { struct kvm_mmu_page *next_root; @@ -124,7 +141,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, typeof(*next_root), link); while (next_root) { - if ((!only_valid || !next_root->role.invalid) && + if (tdp_mmu_root_match(next_root, types) && kvm_tdp_mmu_get_root(next_root)) break; @@ -149,20 +166,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * If shared is set, this function is operating under the MMU lock in read * mode. */ -#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ + _root = tdp_mmu_next_root(_kvm, _root, _types)) \ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true) + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS) #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ + for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ - _root = tdp_mmu_next_root(_kvm, _root, false)) + _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS)) /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, @@ -171,18 +188,28 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * Holding mmu_lock for write obviates the need for RCU protection as the list * is guaranteed to be stable. */ -#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \ +#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ - ((_only_valid) && (_root)->role.invalid))) { \ + !tdp_mmu_root_match((_root), (_types)))) { \ } else -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root(_kvm, _root, _as_id, false) +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). 
+ */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ + } else #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root(_kvm, _root, _as_id, true) + __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { @@ -223,7 +250,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } -int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu) +void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror) { struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role role = mmu->root_role; @@ -231,6 +258,9 @@ int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; + if (mirror) + role.is_mirror = true; + /* * Check for an existing root before acquiring the pages lock to avoid * unnecessary serialization if multiple vCPUs are loading a new root. @@ -282,9 +312,12 @@ out_read_unlock: * and actually consuming the root if it's invalidated after dropping * mmu_lock, and the root can't be freed as this vCPU holds a reference. */ - mmu->root.hpa = __pa(root->spt); - mmu->root.pgd = 0; - return 0; + if (mirror) { + mmu->mirror_root_hpa = __pa(root->spt); + } else { + mmu->root.hpa = __pa(root->spt); + mmu->root.pgd = 0; + } } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -294,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** @@ -322,6 +359,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } +static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, + int level) +{ + kvm_pfn_t old_pfn = spte_to_pfn(old_spte); + int ret; + + /* + * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external + * PTs are removed in a special order, involving free_external_spt(). + * But remove_external_spte() will be called on non-leaf PTEs via + * __tdp_mmu_zap_root(), so avoid the error the former would return + * in this case. + */ + if (!is_last_spte(old_spte, level)) + return; + + /* Zapping leaf spte is allowed only when write lock is held. */ + lockdep_assert_held_write(&kvm->mmu_lock); + /* Because write lock is held, operation should success. 
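
A C11 sketch of the freeze/update/publish sequence used below by set_external_spte_present() and __tdp_mmu_set_spte_atomic(): the SPTE is frozen with a sentinel, the external side is updated, and only then is the final value published (or the old one restored). FROZEN and update_external() are stand-ins, not the kernel's definitions.

#include <stdatomic.h>
#include <stdint.h>

#define FROZEN	((uint64_t)1 << 59)	/* illustrative sentinel */

static int update_external(uint64_t new_spte)
{
	(void)new_spte;
	return 0;			/* pretend the external update succeeded */
}

/* Returns 0 on success, -1 if another CPU changed the SPTE first. */
static int set_spte_frozen(_Atomic uint64_t *sptep, uint64_t old_spte, uint64_t new_spte)
{
	/* Freeze the SPTE so concurrent updaters back off until we're done. */
	if (!atomic_compare_exchange_strong(sptep, &old_spte, FROZEN))
		return -1;

	if (update_external(new_spte))
		atomic_store(sptep, old_spte);	/* roll back on failure */
	else
		atomic_store(sptep, new_spte);	/* publish */
	return 0;
}

int main(void)
{
	_Atomic uint64_t spte = 0;

	return set_spte_frozen(&spte, 0, 0x1000);
}
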
*/ + ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); + KVM_BUG_ON(ret, kvm); +} + /** * handle_removed_pt() - handle a page table removed from the TDP structure * @@ -417,11 +477,81 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, old_spte, FROZEN_SPTE, level, shared); + + if (is_mirror_sp(sp)) { + KVM_BUG_ON(shared, kvm); + remove_external_spte(kvm, gfn, old_spte, level); + } + } + + if (is_mirror_sp(sp) && + WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level, + sp->external_spt))) { + /* + * Failed to free page table page in mirror page table and + * there is nothing to do further. + * Intentionally leak the page to prevent the kernel from + * accessing the encrypted page. + */ + sp->external_spt = NULL; } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } +static void *get_external_spt(gfn_t gfn, u64 new_spte, int level) +{ + if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) { + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte); + + WARN_ON_ONCE(sp->role.level + 1 != level); + WARN_ON_ONCE(sp->gfn != gfn); + return sp->external_spt; + } + + return NULL; +} + +static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep, + gfn_t gfn, u64 old_spte, + u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool is_leaf = is_present && is_last_spte(new_spte, level); + kvm_pfn_t new_pfn = spte_to_pfn(new_spte); + int ret = 0; + + KVM_BUG_ON(was_present, kvm); + + lockdep_assert_held(&kvm->mmu_lock); + /* + * We need to lock out other updates to the SPTE until the external + * page table has been modified. Use FROZEN_SPTE similar to + * the zapping case. + */ + if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE)) + return -EBUSY; + + /* + * Use different call to either set up middle level + * external page table, or leaf. + */ + if (is_leaf) { + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); + } else { + void *external_spt = get_external_spt(gfn, new_spte, level); + + KVM_BUG_ON(!external_spt, kvm); + ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt); + } + if (ret) + __kvm_tdp_mmu_write_spte(sptep, old_spte); + else + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return ret; +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -522,11 +652,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); } -static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, +static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, u64 new_spte) { - u64 *sptep = rcu_dereference(iter->sptep); - /* * The caller is responsible for ensuring the old SPTE is not a FROZEN * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE, @@ -535,15 +664,34 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, */ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); - /* - * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. On failure, i.e. if a different logical - * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with - * the current value, so the caller operates on fresh data, e.g. 
if it - * retries tdp_mmu_set_spte_atomic() - */ - if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) - return -EBUSY; + if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) { + int ret; + + /* + * Users of atomic zapping don't operate on mirror roots, + * so don't handle it and bug the VM if it's seen. + */ + if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm)) + return -EBUSY; + + ret = set_external_spte_present(kvm, iter->sptep, iter->gfn, + iter->old_spte, new_spte, iter->level); + if (ret) + return ret; + } else { + u64 *sptep = rcu_dereference(iter->sptep); + + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs + * and does not hold the mmu_lock. On failure, i.e. if a + * different logical CPU modified the SPTE, try_cmpxchg64() + * updates iter->old_spte with the current value, so the caller + * operates on fresh data, e.g. if it retries + * tdp_mmu_set_spte_atomic() + */ + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) + return -EBUSY; + } return 0; } @@ -573,7 +721,7 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - ret = __tdp_mmu_set_spte_atomic(iter, new_spte); + ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte); if (ret) return ret; @@ -613,6 +761,16 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); + + /* + * Users that do non-atomic setting of PTEs don't operate on mirror + * roots, so don't handle it and bug the VM if it's seen. + */ + if (is_mirror_sptep(sptep)) { + KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm); + remove_external_spte(kvm, gfn, old_spte, level); + } + return old_spte; } @@ -625,19 +783,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, iter->gfn, iter->level); } -#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ - for_each_tdp_pte(_iter, _root, _start, _end) +#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ + for_each_tdp_pte(_iter, _kvm, _root, _start, _end) -#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ - tdp_root_for_each_pte(_iter, _root, _start, _end) \ +#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ if (!is_shadow_present_pte(_iter.old_spte) || \ !is_last_spte(_iter.old_spte, _iter.level)) \ continue; \ else -#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) - static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, struct tdp_iter *iter) { @@ -705,10 +860,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - gfn_t end = tdp_mmu_max_gfn_exclusive(); - gfn_t start = 0; - - for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { + for_each_tdp_pte_min_level_all(iter, root, zap_level) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; @@ -812,7 +964,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; @@ -863,19 +1015,21 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) struct 
kvm_mmu_page *root; /* - * Zap all roots, including invalid roots, as all SPTEs must be dropped - * before returning to the caller. Zap directly even if the root is - * also being zapped by a worker. Walking zapped top-level SPTEs isn't - * all that expensive and mmu_lock is already held, which means the - * worker has yielded, i.e. flushing the work instead of zapping here - * isn't guaranteed to be any faster. + * Zap all direct roots, including invalid direct roots, as all direct + * SPTEs must be dropped before returning to the caller. For TDX, mirror + * roots don't need handling in response to the mmu notifier (the caller). + * + * Zap directly even if the root is also being zapped by a concurrent + * "fast zap". Walking zapped top-level SPTEs isn't all that expensive + * and mmu_lock is already held, which means the other thread has yielded. * * A TLB flush is unnecessary, KVM zaps everything if and only the VM * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root) + __for_each_tdp_mmu_root_yield_safe(kvm, root, -1, + KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS) tdp_mmu_zap_root(kvm, root, false); } @@ -883,11 +1037,14 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast * zap" completes. */ -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared) { struct kvm_mmu_page *root; - read_lock(&kvm->mmu_lock); + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root) { if (!root->tdp_mmu_scheduled_root_to_zap) @@ -905,7 +1062,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * that may be zapped, as such entries are associated with the * ASID on both VMX and SVM. */ - tdp_mmu_zap_root(kvm, root, true); + tdp_mmu_zap_root(kvm, root, shared); /* * The referenced needs to be put *after* zapping the root, as @@ -915,7 +1072,10 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) kvm_tdp_mmu_put_root(kvm, root); } - read_unlock(&kvm->mmu_lock); + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); } /* @@ -928,11 +1088,19 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. * See kvm_tdp_mmu_alloc_root(). */ -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, + enum kvm_tdp_mmu_root_types root_types) { struct kvm_mmu_page *root; /* + * Invalidating invalid roots doesn't make sense, prevent developers from + * having to think about it. + */ + if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS)) + root_types &= ~KVM_INVALID_ROOTS; + + /* * mmu_lock must be held for write to ensure that a root doesn't become * invalid while there are active readers (invalidating a root while * there are active readers may or may not be problematic in practice, @@ -953,6 +1121,9 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) * or get/put references to roots. */ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + if (!tdp_mmu_root_match(root, root_types)) + continue; + /* * Note, invalid roots can outlive a memslot update! 
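
A user-space analogy (pthreads, not the kernel's mmu_lock API) of the new shared/exclusive choice in kvm_tdp_mmu_zap_invalidated_roots(): callers that can tolerate concurrency take the lock for read, teardown paths take it for write.

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static void zap_invalidated_roots(bool shared)
{
	if (shared)
		pthread_rwlock_rdlock(&lock);	/* concurrent zappers allowed */
	else
		pthread_rwlock_wrlock(&lock);	/* exclusive, e.g. VM teardown */

	/* ... walk the root list and zap scheduled roots here ... */

	pthread_rwlock_unlock(&lock);
}

int main(void)
{
	zap_invalidated_roots(false);
	zap_invalidated_roots(true);
	return 0;
}
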
Invalid * roots must be *zapped* before the memslot update completes, @@ -982,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; - if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) - return RET_PF_SPURIOUS; - if (is_shadow_present_pte(iter->old_spte) && - is_access_allowed(fault, iter->old_spte) && - is_last_spte(iter->old_spte, iter->level)) + (fault->prefetch || is_access_allowed(fault, iter->old_spte)) && + is_last_spte(iter->old_spte, iter->level)) { + WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte)); return RET_PF_SPURIOUS; + } if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); @@ -1068,7 +1238,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, */ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault); struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; @@ -1080,7 +1250,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { int r; if (fault->nx_huge_page_workaround_enabled) @@ -1107,13 +1277,18 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) */ sp = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_child_sp(sp, &iter); + if (is_mirror_sp(sp)) + kvm_mmu_alloc_external_spt(vcpu, sp); sp->nx_huge_page_disallowed = fault->huge_page_disallowed; - if (is_shadow_present_pte(iter.old_spte)) + if (is_shadow_present_pte(iter.old_spte)) { + /* Don't support large page for mirrored roots (TDX) */ + KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm); r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); - else + } else { r = tdp_mmu_link_sp(kvm, &iter, sp, true); + } /* * Force the guest to retry if installing an upper level SPTE @@ -1148,12 +1323,16 @@ retry: return ret; } +/* Used by mmu notifier via kvm_unmap_gfn_range() */ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { + enum kvm_tdp_mmu_root_types types; struct kvm_mmu_page *root; - __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS; + + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, range->may_block, flush); @@ -1168,21 +1347,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; if (spte_ad_enabled(iter->old_spte)) { - iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, - iter->old_spte, - shadow_accessed_mask, - iter->level); + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { new_spte = mark_spte_for_access_track(iter->old_spte); - iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, - iter->old_spte, new_spte, - iter->level); + /* + * It is safe for the following cmpxchg to fail. 
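
A sketch of the A/D-enabled aging path above, which clears the Accessed bit with an atomic AND and tolerates racing updates; the bit position is illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define ACCESSED	(1ULL << 8)	/* illustrative position */

/* Returns whether the page was accessed since the previous aging pass. */
static bool age_spte(_Atomic uint64_t *sptep)
{
	return atomic_fetch_and(sptep, ~ACCESSED) & ACCESSED;
}

int main(void)
{
	_Atomic uint64_t spte = ACCESSED | 0x1000;

	/* Exit 0: young on the first pass, old on the second. */
	return !(age_spte(&spte) && !age_spte(&spte));
}
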
Leave the + * Accessed bit set, as the spte is most likely young anyway. + */ + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, @@ -1193,20 +1373,24 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool test_only) { + enum kvm_tdp_mmu_root_types types; struct kvm_mmu_page *root; struct tdp_iter iter; bool ret = false; + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter); + /* * Don't support rescheduling, none of the MMU notifiers that funnel * into this helper allow blocking; it'd be dead, wasteful code. Note, * this helper must NOT be used to unmap GFNs, as it processes only * valid roots! */ - for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { - guard(rcu)(); + WARN_ON(types & ~KVM_VALID_ROOTS); - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { + tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { if (!is_accessed_spte(iter.old_spte)) continue; @@ -1214,7 +1398,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, return true; ret = true; - kvm_tdp_mmu_age_spte(&iter); + kvm_tdp_mmu_age_spte(kvm, &iter); } } @@ -1247,7 +1431,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); - for_each_tdp_pte_min_level(iter, root, min_level, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1366,7 +1550,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, * level above the target level (e.g. splitting a 1GB to 512 2MB pages, * and then splitting each of those to 512 4KB pages). */ - for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; @@ -1445,26 +1629,26 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } -static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * All TDP MMU shadow pages share the same role as their root, aside * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. */ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; + return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled; } static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + tdp_root_for_each_pte(iter, kvm, root, start, end) { retry: if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) @@ -1504,15 +1688,15 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? 
PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) break; @@ -1566,7 +1750,7 @@ static int tdp_mmu_make_huge_spte(struct kvm *kvm, gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); struct tdp_iter iter; - tdp_root_for_each_leaf_pte(iter, root, start, end) { + tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) { /* * Use the parent iterator when checking for forward progress so * that KVM doesn't get stuck continuously trying to yield (i.e. @@ -1600,7 +1784,7 @@ static void recover_huge_pages_range(struct kvm *kvm, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { flush = false; @@ -1681,7 +1865,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; @@ -1726,17 +1910,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { struct tdp_iter iter; - struct kvm_mmu *mmu = vcpu->arch.mmu; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } @@ -1744,6 +1925,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. 
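
A small sketch of the mask walk in clear_dirty_pt_masked() above: only GFNs whose bit is set in the 64-bit dirty mask are visited, and each bit is consumed as it is processed. __builtin_ctzl stands in for __ffs().

#include <stdio.h>
#include <stdint.h>

static void clear_dirty_masked(uint64_t base_gfn, unsigned long mask)
{
	while (mask) {
		unsigned int bit = __builtin_ctzl(mask);	/* lowest set bit */

		printf("clear dirty/writable state for gfn %#llx\n",
		       (unsigned long long)(base_gfn + bit));
		mask &= mask - 1;				/* consume the bit */
	}
}

int main(void)
{
	clear_dirty_masked(0x1000, 0x5);	/* visits gfns 0x1000 and 0x1002 */
	return 0;
}
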
If no @@ -1758,11 +1969,12 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte) { + /* Fast pf is not supported for mirrored roots */ + struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS); struct tdp_iter iter; - struct kvm_mmu *mmu = vcpu->arch.mmu; tdp_ptep_t sptep = NULL; - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index f03ca0dd13d9..52acf99d40a0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -10,7 +10,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { @@ -19,11 +19,56 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); +enum kvm_tdp_mmu_root_types { + KVM_INVALID_ROOTS = BIT(0), + KVM_DIRECT_ROOTS = BIT(1), + KVM_MIRROR_ROOTS = BIT(2), + KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS, + KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS, +}; + +static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm, + enum kvm_gfn_range_filter process) +{ + enum kvm_tdp_mmu_root_types ret = 0; + + if (!kvm_has_mirrored_tdp(kvm)) + return KVM_DIRECT_ROOTS; + + if (process & KVM_FILTER_PRIVATE) + ret |= KVM_MIRROR_ROOTS; + if (process & KVM_FILTER_SHARED) + ret |= KVM_DIRECT_ROOTS; + + WARN_ON_ONCE(!ret); + + return ret; +} + +static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr))) + return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); + + return root_to_sp(vcpu->arch.mmu->root.hpa); +} + +static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu, + enum kvm_tdp_mmu_root_types type) +{ + if (unlikely(type == KVM_MIRROR_ROOTS)) + return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); + + return root_to_sp(vcpu->arch.mmu->root.hpa); +} + bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, + enum kvm_tdp_mmu_root_types root_types); +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 47a46283c866..75e9cfc689f8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -797,7 +797,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); kvm_pmu_call(init)(vcpu); - kvm_pmu_refresh(vcpu); } /* Release perf_events for vPMCs that have been unused for a full time slice. 
*/ diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index e46220ece83c..fde0ae986003 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -7,23 +7,6 @@ #include <asm/cpufeatures.h> /* - * Hardware-defined CPUID leafs that are either scattered by the kernel or are - * unknown to the kernel, but need to be directly used by KVM. Note, these - * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. - */ -enum kvm_only_cpuid_leafs { - CPUID_12_EAX = NCAPINTS, - CPUID_7_1_EDX, - CPUID_8000_0007_EDX, - CPUID_8000_0022_EAX, - CPUID_7_2_EDX, - CPUID_24_0_EBX, - NR_KVM_CPU_CAPS, - - NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, -}; - -/* * Define a KVM-only feature flag. * * For features that are scattered by cpufeatures.h, __feature_translate() also @@ -145,7 +128,10 @@ static __always_inline u32 __feature_translate(int x86_feature) static __always_inline u32 __feature_leaf(int x86_feature) { - return __feature_translate(x86_feature) / 32; + u32 x86_leaf = __feature_translate(x86_feature) / 32; + + reverse_cpuid_check(x86_leaf); + return x86_leaf; } /* @@ -168,7 +154,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_featu { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 85241c0c7f56..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { @@ -283,7 +284,7 @@ void enter_smm(struct kvm_vcpu *vcpu) memset(smram.bytes, 0, sizeof(smram.bytes)); #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, &smram.smram64); else #endif @@ -353,12 +354,12 @@ void enter_smm(struct kvm_vcpu *vcpu) kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) if (kvm_x86_call(set_efer)(vcpu, 0)) goto error; #endif - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; kvm_mmu_reset_context(vcpu); return; error: @@ -586,7 +587,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) * supports long mode. */ #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) { struct kvm_segment cs_desc; unsigned long cr4; @@ -609,7 +610,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) { unsigned long cr4, efer; /* Clear CR4.PAE before clearing EFER.LME. 
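
A trivial sketch of the word/bit split that __feature_leaf() performs (and now bounds-checks) on an X86_FEATURE_* style value; the input number here is made up.

#include <stdio.h>

int main(void)
{
	unsigned int x86_feature = 7 * 32 + 5;	/* illustrative: word 7, bit 5 */
	unsigned int leaf = x86_feature / 32;	/* which 32-bit capability word */
	unsigned int bit  = x86_feature % 32;	/* which bit inside that word */

	printf("leaf=%u bit=%u mask=%#x\n", leaf, bit, 1u << bit);
	return 0;
}
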
*/ @@ -634,7 +635,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) ret = rsm_load_state_64(ctxt, &smram.smram64); else #endif diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index a1cf2ac5bd78..551703fbe200 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -142,6 +142,9 @@ union kvm_smram { static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { + if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE)) + return -ENOTTY; + kvm_make_request(KVM_REQ_SMI, vcpu); return 0; } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 65fd245a9953..067f8e3f5a0d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -330,7 +331,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -796,12 +797,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +824,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +900,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +937,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +957,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. 
If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +972,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b708bdf7eaff..8427a48b8b7a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -111,7 +111,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) + if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -594,7 +594,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with @@ -646,12 +646,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else @@ -673,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. 
+ * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -689,7 +721,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); @@ -710,7 +742,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). */ - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -720,7 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -728,18 +760,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_LBRV)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) pause_count12 = svm->nested.ctl.pause_filter_count; else pause_count12 = 0; - if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) pause_thresh12 = svm->nested.ctl.pause_filter_thresh; else pause_thresh12 = 0; @@ -762,11 +794,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. 
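
A sketch of the forward-progress check described in the bus-lock comment above: the one-shot exemption stays armed only if the guest is still at the same guilty RIP. The function and parameter names are invented for illustration.

#include <stdint.h>

/* Value to load into vmcb02's bus_lock_counter at nested VMRUN. */
static uint32_t bus_lock_counter_on_vmrun(uint64_t stashed_rip, uint64_t rip)
{
	/* Same guilty instruction: keep the exemption armed. */
	if (rip && rip == stashed_rip)
		return 1;
	/* Forward progress (or no pending bus lock): start clean. */
	return 0;
}

int main(void)
{
	return bus_lock_counter_on_vmrun(0xfff0, 0xfff0) == 1 &&
	       bus_lock_counter_on_vmrun(0xfff0, 0x1000) == 0 ? 0 : 1;
}
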
@@ -994,7 +1021,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ - svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); /* Give the current vmcb to the guest */ @@ -1026,7 +1053,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; @@ -1039,8 +1066,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1065,7 +1101,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 22d5a65b410c..288f7f2a46f2 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -46,7 +46,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, switch (msr) { case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) return NULL; /* * Each PMU counter has a pair of CTL and CTR MSRs. CTLn @@ -109,7 +109,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: return pmu->version > 0; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -179,7 +179,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid_0x80000022_ebx ebx; pmu->version = 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) { pmu->version = 2; /* * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. 
the guest @@ -189,7 +189,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; - } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { + } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 943bd074a5d3..459c3b791fd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -26,6 +26,7 @@ #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> #include <asm/sev.h> #include "mmu.h" @@ -140,7 +141,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm) static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } @@ -226,9 +227,7 @@ e_uncharge: static unsigned int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; + return to_kvm_sev_info(kvm)->asid; } static void sev_asid_free(struct kvm_sev_info *sev) @@ -403,7 +402,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_init *data, unsigned long vm_type) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; bool es_active = vm_type != KVM_X86_SEV_VM; u64 valid_vmsa_features = es_active ? 
sev_supported_vmsa_features : 0; @@ -500,10 +499,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_init data; - if (!sev->need_init) + if (!to_kvm_sev_info(kvm)->need_init) return -EINVAL; if (kvm->arch.vm_type != KVM_X86_SEV_VM && @@ -543,14 +541,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error) static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ -563,6 +561,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -622,9 +622,9 @@ e_free_dh: static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, - int write) + unsigned int flags) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); unsigned long npages, size; int npinned; unsigned long locked, lock_limit; @@ -663,7 +663,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, flags, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); ret = -ENOMEM; @@ -686,11 +686,9 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unpin_user_pages(pages, npages); kvfree(pages); - sev->pages_locked -= npages; + to_kvm_sev_info(kvm)->pages_locked -= npages; } static void sev_clflush_pages(struct page *pages[], unsigned long npages) @@ -734,7 +732,6 @@ static unsigned long get_num_contig_pages(unsigned long idx, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data data; struct page **inpages; @@ -751,7 +748,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) vaddr_end = vaddr + size; /* Lock the user memory. 
*/ - inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages); @@ -762,7 +759,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_clflush_pages(inpages, npages); data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -802,7 +799,7 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); struct sev_es_save_area *save = svm->sev_es.vmsa; struct xregs_state *xsave; const u8 *s; @@ -972,7 +969,6 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *measure = u64_to_user_ptr(argp->data); - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; @@ -1005,7 +1001,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* @@ -1033,19 +1029,17 @@ e_free_blob: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status data; int ret; @@ -1055,7 +1049,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) return ret; @@ -1074,11 +1068,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg data; data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; data.dst_addr = dst; data.src_addr = src; data.len = size; @@ -1250,7 +1243,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); return PTR_ERR(dst_p); @@ -1302,7 +1295,6 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; @@ -1316,7 +1308,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - pages = sev_pin_memory(kvm, 
params.guest_uaddr, params.guest_len, &n, 1); + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -1358,7 +1350,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) data.hdr_address = __psp_pa(hdr); data.hdr_len = params.hdr_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); @@ -1378,7 +1370,6 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *report = u64_to_user_ptr(argp->data); - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; @@ -1411,7 +1402,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -1441,12 +1432,11 @@ static int __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_start *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; @@ -1459,7 +1449,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; struct kvm_sev_send_start params; void *amd_certs, *session_data; @@ -1520,7 +1509,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) data.amd_certs_len = params.amd_certs_len; data.session_address = __psp_pa(session_data); data.session_len = params.session_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); @@ -1552,12 +1541,11 @@ static int __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_update_data *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); params->hdr_len = data.hdr_len; @@ -1572,7 +1560,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; @@ -1608,11 +1595,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = 
kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1626,7 +1613,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); @@ -1657,31 +1644,29 @@ e_unpin: static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); } static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_cancel data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); } static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_receive_start start; struct kvm_sev_receive_start params; int *error = &argp->error; @@ -1755,7 +1740,6 @@ e_free_pdh: static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; @@ -1798,7 +1782,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 1); + PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1815,7 +1799,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); @@ -1832,13 +1816,12 @@ e_free_hdr: static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_receive_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } @@ -1858,8 +1841,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); int r = -EBUSY; if (dst_kvm == src_kvm) @@ -1893,8 +1876,8 @@ release_dst: static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info 
*dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); mutex_unlock(&dst_kvm->lock); mutex_unlock(&src_kvm->lock); @@ -1902,74 +1885,10 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - -#ifdef CONFIG_PROVE_LOCKING - if (!i) - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - else - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); -#endif - } - - return 0; - -out_unlock: - - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - -#ifdef CONFIG_PROVE_LOCKING - if (j) - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); -#endif - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; struct kvm_sev_info *mirror; @@ -2009,8 +1928,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * and add the new mirror to the list. 
*/ if (is_mirroring_enc_context(dst_kvm)) { - struct kvm_sev_info *owner_sev_info = - &to_kvm_svm(dst->enc_context_owner)->sev_info; + struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); list_del(&src->mirror_entry); list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); @@ -2069,7 +1987,7 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); struct kvm_sev_info *src_sev, *cg_cleanup_sev; CLASS(fd, f)(source_fd); struct kvm *source_kvm; @@ -2093,7 +2011,7 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) goto out_unlock; } - src_sev = &to_kvm_svm(source_kvm)->sev_info; + src_sev = to_kvm_sev_info(source_kvm); dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; @@ -2104,10 +2022,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -2121,9 +2039,9 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -2181,7 +2099,7 @@ static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) static int snp_bind_asid(struct kvm *kvm, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_activate data = {0}; data.gctx_paddr = __psp_pa(sev->snp_context); @@ -2191,7 +2109,7 @@ static int snp_bind_asid(struct kvm *kvm, int *error) static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_launch_start start = {0}; struct kvm_sev_snp_launch_start params; int rc; @@ -2220,6 +2138,8 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) return -EINVAL; + sev->policy = params.policy; + sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) return -ENOTTY; @@ -2260,7 +2180,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf void __user *src, int order, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); int n_private = 0, ret, i; int npages = (1 << order); gfn_t gfn; @@ -2350,7 +2270,7 @@ err: static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_gmem_populate_args sev_populate_args = {0}; struct kvm_sev_snp_launch_update params; struct kvm_memory_slot *memslot; @@ -2434,7 +2354,7 @@ out: static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info 
*sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_launch_update data = {}; struct kvm_vcpu *vcpu; unsigned long i; @@ -2482,7 +2402,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct kvm_sev_snp_launch_finish params; struct sev_data_snp_launch_finish *data; void *id_block = NULL, *id_auth = NULL; @@ -2677,7 +2597,7 @@ out: int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0; @@ -2696,7 +2616,8 @@ int sev_mem_enc_register_region(struct kvm *kvm, return -ENOMEM; mutex_lock(&kvm->lock); - region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, + FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); mutex_unlock(&kvm->lock); @@ -2729,7 +2650,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -2824,9 +2745,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ - source_sev = &to_kvm_svm(source_kvm)->sev_info; + source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2854,7 +2775,7 @@ e_unlock: static int snp_decommission_context(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_addr data = {}; int ret; @@ -2879,7 +2800,7 @@ static int snp_decommission_context(struct kvm *kvm) void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -2950,9 +2871,37 @@ void __init sev_set_cpu_caps(void) } } +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; +} + void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2972,6 +2921,16 @@ void __init sev_hardware_setup(void) 
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); @@ -3043,6 +3002,14 @@ void __init sev_hardware_setup(void) sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : @@ -3051,11 +3018,11 @@ out: min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - sev_es_supported ? "enabled" : "disabled", + str_enabled_disabled(sev_es_supported), min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); if (boot_cpu_has(X86_FEATURE_SEV_SNP)) pr_info("SEV-SNP %s (ASIDs %u - %u)\n", - sev_snp_supported ? "enabled" : "disabled", + str_enabled_disabled(sev_snp_supported), min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; @@ -3084,6 +3051,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -3129,7 +3098,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -3183,9 +3152,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3194,18 +3168,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. 
+ */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3261,7 +3241,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ @@ -3276,11 +3256,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3420,8 +3395,7 @@ vmgexit_err: dump_ghcb(svm); } - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); + svm_vmgexit_bad_input(svm, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -3462,10 +3436,19 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) svm->sev_es.ghcb = NULL; } -void pre_sev_run(struct vcpu_svm *svm, int cpu) +int pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - unsigned int asid = sev_get_asid(svm->vcpu.kvm); + struct kvm *kvm = svm->vcpu.kvm; + unsigned int asid = sev_get_asid(kvm); + + /* + * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid + * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP + * AP Destroy event. 
+ */ + if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) + return -EINVAL; /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -3478,11 +3461,12 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) */ if (sd->sev_vmcbs[asid] == svm->vmcb && svm->vcpu.arch.last_vmentry_cpu == cpu) - return; + return 0; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + return 0; } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) @@ -3564,8 +3548,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -3627,13 +3610,20 @@ static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) return 1; /* resume guest */ } - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); return 1; /* resume guest */ } vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = 1; vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) @@ -3658,7 +3648,14 @@ static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) svm->sev_es.psc_inflight = 0; svm->sev_es.psc_idx = 0; svm->sev_es.psc_2m = false; - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret); + + /* + * PSC requests always get a "no action" response in SW_EXITINFO1, with + * a PSC-specific return code in SW_EXITINFO2 that provides the "real" + * return code. E.g. if the PSC request was interrupted, the need to + * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. + */ + svm_vmgexit_no_action(svm, psc_ret); } static void __snp_complete_one_psc(struct vcpu_svm *svm) @@ -3710,7 +3707,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) bool huge; u64 gfn; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1; } @@ -3797,6 +3794,13 @@ next_range: case VMGEXIT_PSC_OP_SHARED: vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 
+ */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE @@ -3820,113 +3824,93 @@ next_range: goto next_range; } - unreachable(); + BUG(); } -static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_memory_slot *slot; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!sev_snp_guest(vcpu->kvm)) + return; - WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + guard(mutex)(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + return; + + svm->sev_es.snp_ap_waiting_for_reset = false; /* Mark the vCPU as offline and not runnable */ vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); /* Clear use of the VMSA */ svm->vmcb->control.vmsa_pa = INVALID_PAGE; - if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { - gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); - struct kvm_memory_slot *slot; - struct page *page; - kvm_pfn_t pfn; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot) - return -EINVAL; - - /* - * The new VMSA will be private memory guest memory, so - * retrieve the PFN from the gmem backend. - */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) - return -EINVAL; - - /* - * From this point forward, the VMSA will always be a - * guest-mapped page rather than the initial one allocated - * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa - * could be free'd and cleaned up here, but that involves - * cleanups like wbinvd_on_all_cpus() which would ideally - * be handled during teardown rather than guest boot. - * Deferring that also allows the existing logic for SEV-ES - * VMSAs to be re-used with minimal SNP-specific changes. - */ - svm->sev_es.snp_has_guest_vmsa = true; - - /* Use the new VMSA */ - svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); - - /* Mark the vCPU as runnable */ - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - - svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - - /* - * gmem pages aren't currently migratable, but if this ever - * changes then care should be taken to ensure - * svm->sev_es.vmsa is pinned through some other means. - */ - kvm_release_page_clean(page); - } - /* * When replacing the VMSA during SEV-SNP AP creation, * mark the VMCB dirty so that full state is always reloaded. */ vmcb_mark_all_dirty(svm->vmcb); - return 0; -} + if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) + return; -/* - * Invoked as part of svm_vcpu_reset() processing of an init event. - */ -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int ret; + gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - if (!sev_snp_guest(vcpu->kvm)) + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) return; - mutex_lock(&svm->sev_es.snp_vmsa_mutex); + /* + * The new VMSA will be private memory guest memory, so retrieve the + * PFN from the gmem backend. 
+ */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) + return; - if (!svm->sev_es.snp_ap_waiting_for_reset) - goto unlock; + /* + * From this point forward, the VMSA will always be a guest-mapped page + * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In + * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but + * that involves cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. Deferring that + * also allows the existing logic for SEV-ES VMSAs to be re-used with + * minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; - svm->sev_es.snp_ap_waiting_for_reset = false; + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); - ret = __sev_snp_update_protected_guest_state(vcpu); - if (ret) - vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + /* Mark the vCPU as runnable */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); -unlock: - mutex_unlock(&svm->sev_es.snp_vmsa_mutex); + /* + * gmem pages aren't currently migratable, but if this ever changes + * then care should be taken to ensure svm->sev_es.vmsa is pinned + * through some other means. + */ + kvm_release_page_clean(page); } static int sev_snp_ap_creation(struct vcpu_svm *svm) { - struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct kvm_vcpu *vcpu = &svm->vcpu; struct kvm_vcpu *target_vcpu; struct vcpu_svm *target_svm; unsigned int request; unsigned int apic_id; - bool kick; - int ret; request = lower_32_bits(svm->vmcb->control.exit_info_1); apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); @@ -3939,47 +3923,23 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) return -EINVAL; } - ret = 0; - target_svm = to_svm(target_vcpu); - /* - * The target vCPU is valid, so the vCPU will be kicked unless the - * request is for CREATE_ON_INIT. For any errors at this stage, the - * kick will place the vCPU in an non-runnable state. 
- */ - kick = true; - - mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); - - target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - target_svm->sev_es.snp_ap_waiting_for_reset = true; - - /* Interrupt injection mode shouldn't change for AP creation */ - if (request < SVM_VMGEXIT_AP_DESTROY) { - u64 sev_features; - - sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; - sev_features ^= sev->vmsa_features; - - if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { - vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", - vcpu->arch.regs[VCPU_REGS_RAX]); - ret = -EINVAL; - goto out; - } - } + guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); switch (request) { case SVM_VMGEXIT_AP_CREATE_ON_INIT: - kick = false; - fallthrough; case SVM_VMGEXIT_AP_CREATE: + if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { + vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); + return -EINVAL; + } + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", svm->vmcb->control.exit_info_2); - ret = -EINVAL; - goto out; + return -EINVAL; } /* @@ -3993,30 +3953,30 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) vcpu_unimpl(vcpu, "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", svm->vmcb->control.exit_info_2); - ret = -EINVAL; - goto out; + return -EINVAL; } target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; break; case SVM_VMGEXIT_AP_DESTROY: + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; break; default: vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", request); - ret = -EINVAL; - break; + return -EINVAL; } -out: - if (kick) { - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } + target_svm->sev_es.snp_ap_waiting_for_reset = true; - mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); + /* + * Unless Creation is deferred until INIT, signal the vCPU to update + * its state. + */ + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - return ret; + return 0; } static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) @@ -4055,7 +4015,8 @@ static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_ goto out_unlock; } - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err)); + /* No action is requested *from KVM* if there was a firmware error. 
*/ + svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); ret = 1; /* resume guest */ @@ -4111,8 +4072,7 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r return snp_handle_guest_req(svm, req_gpa, resp_gpa); request_invalid: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); return 1; /* resume guest */ } @@ -4120,7 +4080,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); u64 ghcb_info; int ret = 1; @@ -4304,8 +4264,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (ret) return ret; - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); + svm_vmgexit_success(svm, 0); exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { @@ -4340,7 +4299,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); switch (control->exit_info_1) { case 0: @@ -4349,21 +4308,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); + svm_vmgexit_success(svm, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; break; } case SVM_VMGEXIT_HV_FEATURES: - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED); - + svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); ret = 1; break; case SVM_VMGEXIT_TERM_REQUEST: @@ -4384,8 +4341,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) case SVM_VMGEXIT_AP_CREATION: ret = sev_snp_ap_creation(svm); if (ret) { - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; @@ -4435,8 +4391,8 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); } @@ -4445,16 +4401,15 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. * - * guest_can_use() checks a number of requirements on the host/guest to - * ensure that MSR_IA32_XSS is available, but it might report true even - * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host - * MSR_IA32_XSS is always properly restored. 
For SEV-ES, it is better - * to further check that the guest CPUID actually supports - * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved - * guests will still get intercepted and caught in the normal - * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths. + * KVM treats the guest as being capable of using XSAVES even if XSAVES + * isn't enabled in guest CPUID as there is no intercept for XSAVES, + * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is + * exposed to the guest and XSAVES is supported in hardware. Condition + * full XSS passthrough on the guest being able to use XSAVES *and* + * XSAVES being exposed to the guest so that KVM can at least honor + * guest CPUID for RDMSR and WRMSR. */ - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); else @@ -4477,6 +4432,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; @@ -4492,6 +4448,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; + /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); svm_clr_intercept(svm, INTERCEPT_CR4_READ); @@ -4552,7 +4512,7 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); /* * Set the GHCB MSR value as per the GHCB specification when emulating @@ -4567,6 +4527,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -4590,14 +4552,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. 
*/ - if (sev_vcpu_has_debug_swap(svm)) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); @@ -4622,7 +4592,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * Return from an AP Reset Hold VMGEXIT, where the guest will * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + svm_vmgexit_success(svm, 1); break; case AP_RESET_HOLD_MSR_PROTO: /* @@ -4820,7 +4790,7 @@ static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); kvm_pfn_t pfn_aligned; gfn_t gfn_aligned; int level, rc; @@ -4942,3 +4912,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return level; } + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. 
+ */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 21dacd312779..ab9b947dbf4f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -28,8 +28,11 @@ #include <linux/rwsem.h> #include <linux/cc_platform.h> #include <linux/smp.h> +#include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -230,6 +233,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -248,6 +253,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -284,8 +291,6 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -476,24 +481,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -567,7 +566,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } @@ -580,15 +579,15 @@ static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. 
*/ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } @@ -617,7 +616,7 @@ static int svm_enable_virtualization_cpu(void) uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -627,9 +626,9 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -650,13 +649,12 @@ static int svm_enable_virtualization_cpu(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -1049,7 +1047,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || - (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) @@ -1187,14 +1185,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { if (!npt_enabled || - !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) + !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); } if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) svm_clr_intercept(svm, INTERCEPT_RDTSCP); else svm_set_intercept(svm, INTERCEPT_RDTSCP); @@ -1298,8 +1296,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1372,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -1481,25 +1486,10 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. 
- */ - svm_clear_current_vmcb(svm->vmcb); - svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1509,6 +1499,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1541,6 +1588,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1551,18 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -1921,9 +1964,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = vcpu->arch.cr4; - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb_current(vcpu); - vcpu->arch.cr4 = cr4; if (!npt_enabled) { cr4 |= X86_CR4_PAE; @@ -1936,7 +1976,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1995,11 +2035,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void 
svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -2142,14 +2182,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2160,17 +2199,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2224,6 +2257,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -2864,7 +2901,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && - !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2940,7 +2977,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; msr_info->data = svm->virt_spec_ctrl; @@ -2977,11 +3014,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -3024,7 +3057,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -3046,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); @@ -3091,7 +3124,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; if (data & ~SPEC_CTRL_SSBD) @@ -3169,6 +3202,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, 
data); break; } + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3263,7 +3306,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) unsigned long type; gva_t gva; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3276,9 +3319,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3346,7 +3431,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -3361,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? 
"SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3406,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3476,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3508,10 +3670,14 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return 
kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -3533,6 +3699,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } +static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, + u32 *error_code) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *intr_info = control->event_inj; + + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->event_inj_err; + else + *error_code = 0; + +} + static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3576,7 +3757,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -3598,6 +3779,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -4105,20 +4288,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -4178,6 +4364,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. 
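	 *
	 * [Editor's note, annotation only, not part of the patch: the
	 * corresponding vmenter.S hunks later in this patch delete the
	 * STI/CLI pair that used to bracket VMRUN in __svm_vcpu_run() and
	 * __svm_sev_es_vcpu_run(), so the raw_local_irq_enable() /
	 * raw_local_irq_disable() pair below is now the only place host
	 * RFLAGS.IF is toggled around VMRUN.]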
+ */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4186,6 +4384,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4220,7 +4420,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); - pre_svm_run(vcpu); + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -4236,14 +4441,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4271,6 +4484,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -4392,27 +4609,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give * the guest read/write access to the host's XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); + guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); /* * Intercept VMLOAD if the vCPU model is Intel in order to emulate that * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing * SVM on Intel is bonkers and extremely unlikely to work). 
*/ - if (!guest_cpuid_is_intel_compatible(vcpu)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); + if (guest_cpuid_is_intel_compatible(vcpu)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4422,7 +4629,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); @@ -4673,7 +4880,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) * responsible for ensuring nested SVM and SMIs are mutually exclusive. */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 1; smram->smram64.svm_guest_flag = 1; @@ -4720,14 +4927,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) const struct kvm_smram_state_64 *smram64 = &smram->smram64; - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ if (!smram64->svm_guest_flag) return 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return 1; if (!(smram64->efer & EFER_SVME)) @@ -4790,9 +4997,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { + struct vcpu_svm *svm = to_svm(vcpu); bool smep, smap, is_user; u64 error_code; + /* Check that emulation is possible during event vectoring */ + if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) return X86EMUL_CONTINUE; @@ -4889,7 +5102,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * In addition, don't apply the erratum workaround if the #NPF occurred * while translating guest page tables (see below). 
*/ - error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + error_code = svm->vmcb->control.exit_info_1; if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; @@ -4953,6 +5166,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -4978,6 +5193,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } @@ -5036,6 +5252,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -5077,6 +5294,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, + .get_entry_info = svm_get_entry_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -5147,7 +5365,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -5221,6 +5439,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5328,7 +5549,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); + pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5416,6 +5637,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 43fa6a16eb19..e6f3c6a153a0 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -98,6 +98,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ @@ -114,6 +115,9 @@ struct kvm_sev_info { struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; @@ -169,6 +173,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -335,11 +340,11 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; - struct vmcb *current_vmcb; - /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; @@ -358,39 +363,30 @@ static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) return &to_kvm_svm(kvm)->sev_info; } +#ifdef CONFIG_KVM_AMD_SEV static __always_inline bool sev_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif + return to_kvm_sev_info(kvm)->active; } - static __always_inline bool sev_es_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return sev->es_active && !WARN_ON_ONCE(!sev->active); -#else - return false; -#endif } static __always_inline bool sev_snp_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && !WARN_ON_ONCE(!sev_es_guest(kvm)); +} #else - return false; +#define sev_guest(kvm) false +#define sev_es_guest(kvm) false +#define sev_snp_guest(kvm) false #endif -} static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) { @@ -502,7 +498,7 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } @@ -554,7 +550,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } @@ -588,10 +584,39 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) return false; } +static inline void 
svm_vmgexit_set_return_code(struct vcpu_svm *svm, + u64 response, u64 data) +{ + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data); +} + +static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector) +{ + u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector; + + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data); +} + +static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror); +} + +static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + +static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; @@ -722,7 +747,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -void pre_sev_run(struct vcpu_svm *svm, int cpu); +int pre_sev_run(struct vcpu_svm *svm, int cpu); void sev_init_vmcb(struct vcpu_svm *svm); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); @@ -763,6 +788,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { @@ -794,6 +821,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} #endif /* vmenter.S */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 2ed80aea3bb1..0c61153b275f 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - 1: vmrun %rax - -2: cli - +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index d3aeffd6ae75..ba736cbb0587 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -11,6 +11,13 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#ifdef CREATE_TRACE_POINTS +#define tracing_kvm_rip_read(vcpu) ({ \ + typeof(vcpu) __vcpu = vcpu; \ + __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \ + }) +#endif + /* * Tracepoint for guest mode entry. 
*/ @@ -22,15 +29,22 @@ TRACE_EVENT(kvm_entry, __field( unsigned int, vcpu_id ) __field( unsigned long, rip ) __field( bool, immediate_exit ) + __field( u32, intr_info ) + __field( u32, error_code ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->rip = kvm_rip_read(vcpu); + __entry->rip = tracing_kvm_rip_read(vcpu); __entry->immediate_exit = force_immediate_exit; + + kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info, + &__entry->error_code); ), - TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, + TP_printk("vcpu %u, rip 0x%lx intr_info 0x%08x error_code 0x%08x%s", + __entry->vcpu_id, __entry->rip, + __entry->intr_info, __entry->error_code, __entry->immediate_exit ? "[immediate exit]" : "") ); @@ -308,12 +322,14 @@ TRACE_EVENT(name, \ __field( u32, intr_info ) \ __field( u32, error_code ) \ __field( unsigned int, vcpu_id ) \ + __field( u64, requests ) \ ), \ \ TP_fast_assign( \ - __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->guest_rip = tracing_kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ + __entry->requests = READ_ONCE(vcpu->requests); \ kvm_x86_call(get_exit_info)(vcpu, \ &__entry->exit_reason, \ &__entry->info1, \ @@ -323,11 +339,13 @@ TRACE_EVENT(name, \ ), \ \ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ - "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x " \ + "requests 0x%016llx", \ __entry->vcpu_id, \ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ __entry->guest_rip, __entry->info1, __entry->info2, \ - __entry->intr_info, __entry->error_code) \ + __entry->intr_info, __entry->error_code, \ + __entry->requests) \ ) /* @@ -412,7 +430,7 @@ TRACE_EVENT(kvm_page_fault, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->guest_rip = kvm_rip_read(vcpu); + __entry->guest_rip = tracing_kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), @@ -819,12 +837,12 @@ TRACE_EVENT(kvm_emulate_insn, TP_ARGS(vcpu, failed), TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, X86_MAX_INSTRUCTION_LENGTH ) + __field( __u8, flags ) + __field( __u8, failed ) ), TP_fast_assign( @@ -835,7 +853,7 @@ TRACE_EVENT(kvm_emulate_insn, __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt->fetch.data, - 15); + X86_MAX_INSTRUCTION_LENGTH); __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode); __entry->failed = failed; ), diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h new file mode 100644 index 000000000000..a0c5e8781c33 --- /dev/null +++ b/arch/x86/kvm/vmx/common.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_X86_VMX_COMMON_H +#define __KVM_X86_VMX_COMMON_H + +#include <linux/kvm_host.h> +#include <asm/posted_intr.h> + +#include "mmu.h" + +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 
full; +}; + +struct vcpu_vt { + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + + union vmx_exit_reason exit_reason; + + unsigned long exit_qualification; + u32 exit_intr_info; + + /* + * If true, guest state has been loaded into hardware, and host state + * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into + * hardware. + */ + bool guest_state_loaded; + bool emulation_required; + +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; +#endif + + unsigned long host_debugctlmsr; +}; + +#ifdef CONFIG_KVM_INTEL_TDX + +static __always_inline bool is_td(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) +{ + return is_td(vcpu->kvm); +} + +#else + +static __always_inline bool is_td(struct kvm *kvm) { return false; } +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } + +#endif + +static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) +{ + /* For TDX the direct mask is the shared mask. */ + return !kvm_is_addr_direct(kvm, gpa); +} + +static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long exit_qualification) +{ + u64 error_code; + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) + ? PFERR_PRESENT_MASK : 0; + + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) + error_code |= PFERR_PRIVATE_ACCESS; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + /* + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. + * + * When the target is not the running vCPU, the following + * possibilities emerge: + * + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. + * + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. + * + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. 
+ */ + + if (vcpu != kvm_get_running_vcpu()) + __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return; + } +#endif + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); +} + +/* + * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the + * interrupt if necessary. + */ +static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, + struct pi_desc *pi_desc, int vector) +{ + if (pi_test_and_set_pir(vector, pi_desc)) + return; + + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(pi_desc)) + return; + + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); +} + +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_VMX_COMMON_H */ diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index a87407412615..11a339009781 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -42,7 +42,7 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) return vmx->nested.hv_evmcs; } -static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu) { /* * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h index a543fccfc574..6536290f4274 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.h +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -6,7 +6,7 @@ #ifndef __KVM_X86_VMX_HYPERV_EVMCS_H #define __KVM_X86_VMX_HYPERV_EVMCS_H -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "capabilities.h" #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 92d35cc6cd15..d1e02e567b57 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -3,9 +3,888 @@ #include "x86_ops.h" #include "vmx.h" +#include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" +#include "tdx_arch.h" + +#ifdef CONFIG_KVM_INTEL_TDX +static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt)); + +static void vt_disable_virtualization_cpu(void) +{ + /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */ + if (enable_tdx) + tdx_disable_virtualization_cpu(); + vmx_disable_virtualization_cpu(); +} + +static __init int vt_hardware_setup(void) +{ + int ret; + + ret = vmx_hardware_setup(); + if (ret) + return ret; + + /* + * Update vt_x86_ops::vm_size here so it is ready before + * kvm_ops_update() is called in kvm_x86_vendor_init(). + * + * Note, the actual bringing up of TDX must be done after + * kvm_ops_update() because enabling TDX requires enabling + * hardware virtualization first, i.e., all online CPUs must + * be in post-VMXON state. This means the @vm_size here + * may be updated to TDX's size but TDX may fail to enable + * at later time. + * + * The VMX/VT code could update kvm_x86_ops::vm_size again + * after bringing up TDX, but this would require exporting + * either kvm_x86_ops or kvm_ops_update() from the base KVM + * module, which looks overkill. 
Anyway, the worst case here + * is KVM may allocate couple of more bytes than needed for + * each VM. + */ + if (enable_tdx) { + vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, + sizeof(struct kvm_tdx)); + /* + * Note, TDX may fail to initialize in a later time in + * vt_init(), in which case it is not necessary to setup + * those callbacks. But making them valid here even + * when TDX fails to init later is fine because those + * callbacks won't be called if the VM isn't TDX guest. + */ + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; + } + + return 0; +} + +static int vt_vm_init(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_init(kvm); + + return vmx_vm_init(kvm); +} + +static void vt_vm_pre_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_mmu_release_hkid(kvm); +} + +static void vt_vm_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_destroy(kvm); + + vmx_vm_destroy(kvm); +} + +static int vt_vcpu_precreate(struct kvm *kvm) +{ + if (is_td(kvm)) + return 0; + + return vmx_vcpu_precreate(kvm); +} + +static int vt_vcpu_create(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_create(vcpu); + + return vmx_vcpu_create(vcpu); +} + +static void vt_vcpu_free(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_free(vcpu); + return; + } + + vmx_vcpu_free(vcpu); +} + +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_reset(vcpu, init_event); + return; + } + + vmx_vcpu_reset(vcpu, init_event); +} + +static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_load(vcpu, cpu); + return; + } + + vmx_vcpu_load(vcpu, cpu); +} + +static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + /* + * Basic TDX does not support feature PML. KVM does not enable PML in + * TD's VMCS, nor does it allocate or flush PML buffer for TDX. + */ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_update_cpu_dirty_logging(vcpu); +} + +static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_prepare_switch_to_guest(vcpu); + return; + } + + vmx_prepare_switch_to_guest(vcpu); +} + +static void vt_vcpu_put(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_put(vcpu); + return; + } + + vmx_vcpu_put(vcpu); +} + +static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_pre_run(vcpu); + + return vmx_vcpu_pre_run(vcpu); +} + +static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_run(vcpu, force_immediate_exit); + + return vmx_vcpu_run(vcpu, force_immediate_exit); +} + +static int vt_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath) +{ + if (is_td_vcpu(vcpu)) + return tdx_handle_exit(vcpu, fastpath); + + return vmx_handle_exit(vcpu, fastpath); +} + +static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_set_msr(vcpu, msr_info); + + return vmx_set_msr(vcpu, msr_info); +} + +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. 
+ */ +static bool vt_has_emulated_msr(struct kvm *kvm, u32 index) +{ + if (kvm && is_td(kvm)) + return tdx_has_emulated_msr(index); + + return vmx_has_emulated_msr(kvm, index); +} + +static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_get_msr(vcpu, msr_info); + + return vmx_get_msr(vcpu, msr_info); +} + +static void vt_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + /* + * TDX doesn't allow VMM to configure interception of MSR accesses. + * TDX guest requests MSR accesses by calling TDVMCALL. The MSR + * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR + * if the userspace has set any. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_msr_filter_changed(vcpu); +} + +static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (is_td_vcpu(vcpu)) + return tdx_complete_emulated_msr(vcpu, err); + + return vmx_complete_emulated_msr(vcpu, err); +} + +#ifdef CONFIG_KVM_SMM +static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_smi_allowed(vcpu, for_injection); +} + +static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_enter_smm(vcpu, smram); +} + +static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_leave_smm(vcpu, smram); +} + +static void vt_enable_smi_window(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return; + + /* RSM will cause a vmexit anyway. */ + vmx_enable_smi_window(vcpu); +} +#endif + +static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + /* + * For TDX, this can only be triggered for MMIO emulation. Let the + * guest retry after installing the SPTE with suppress #VE bit cleared, + * so that the guest will receive #VE when retry. The guest is expected + * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on + * #VE. + */ + if (is_td_vcpu(vcpu)) + return X86EMUL_RETRY_INSTR; + + return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len); +} + +static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + /* + * INIT and SIPI are always blocked for TDX, i.e., INIT handling and + * the OP vcpu_deliver_sipi_vector() won't be called. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_apic_init_signal_blocked(vcpu); +} + +static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + /* Only x2APIC mode is supported for TD. 
*/ + if (is_td_vcpu(vcpu)) + return; + + return vmx_set_virtual_apic_mode(vcpu); +} + +static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + if (is_td_vcpu(vcpu)) + return; + + return vmx_hwapic_isr_update(vcpu, max_isr); +} + +static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return -1; + + return vmx_sync_pir_to_irr(vcpu); +} + +static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + if (is_td_vcpu(apic->vcpu)) { + tdx_deliver_interrupt(apic, delivery_mode, trig_mode, + vector); + return; + } + + vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); +} + +static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_vcpu_after_set_cpuid(vcpu); +} + +static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_exception_bitmap(vcpu); +} + +static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_segment_base(vcpu, seg); +} + +static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) { + memset(var, 0, sizeof(*var)); + return; + } + + vmx_get_segment(vcpu, var, seg); +} + +static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_segment(vcpu, var, seg); +} + +static int vt_get_cpl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl(vcpu); +} + +static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl_no_cache(vcpu); +} + +static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + if (is_td_vcpu(vcpu)) { + *db = 0; + *l = 0; + return; + } + + vmx_get_cs_db_l_bits(vcpu, db, l); +} + +static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr0(vcpu, cr0); +} + +static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr0(vcpu, cr0); +} + +static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr4(vcpu, cr4); +} + +static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr4(vcpu, cr4); +} + +static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_set_efer(vcpu, efer); +} + +static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_idt(vcpu, dt); +} + +static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_idt(vcpu, dt); +} + +static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_gdt(vcpu, dt); +} + +static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_gdt(vcpu, dt); +} + +static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr6(vcpu, val); +} + +static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr7(vcpu, val); +} + +static void 
vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + /* + * MOV-DR exiting is always cleared for TD guest, even in debug mode. + * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never + * reach here for TD vcpu. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_sync_dirty_debug_regs(vcpu); +} + +static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_cache_reg(vcpu, reg); +} + +static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_rflags(vcpu); +} + +static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_rflags(vcpu, rflags); +} + +static bool vt_get_if_flag(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_if_flag(vcpu); +} + +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_all(vcpu); + return; + } + + vmx_flush_tlb_all(vcpu); +} + +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_current(vcpu); + return; + } + + vmx_flush_tlb_current(vcpu); +} + +static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_gva(vcpu, addr); +} + +static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_guest(vcpu); +} + +static void vt_inject_nmi(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_inject_nmi(vcpu); + return; + } + + vmx_inject_nmi(vcpu); +} + +static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + /* + * The TDX module manages NMI windows and NMI reinjection, and hides NMI + * blocking, all KVM can do is throw an NMI over the wall. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_nmi_allowed(vcpu, for_injection); +} + +static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get NMI blocking status for TDX guest, assume NMIs are + * always unmasked. + */ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_nmi_mask(vcpu); +} + +static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_nmi_mask(vcpu, masked); +} + +static void vt_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + /* Refer to the comments in tdx_inject_nmi(). */ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_nmi_window(vcpu); +} + +static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int pgd_level) +{ + if (is_td_vcpu(vcpu)) { + tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level); + return; + } + + vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level); +} + +static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_interrupt_shadow(vcpu, mask); +} + +static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_interrupt_shadow(vcpu); +} + +static void vt_patch_hypercall(struct kvm_vcpu *vcpu, + unsigned char *hypercall) +{ + /* + * Because guest memory is protected, guest can't be patched. TD kernel + * is modified to use TDG.VP.VMCALL for hypercall. 
+ */ + if (is_td_vcpu(vcpu)) + return; + + vmx_patch_hypercall(vcpu, hypercall); +} + +static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_irq(vcpu, reinjected); +} + +static void vt_inject_exception(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_exception(vcpu); +} + +static void vt_cancel_injection(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_injection(vcpu); +} + +static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (is_td_vcpu(vcpu)) + return tdx_interrupt_allowed(vcpu); + + return vmx_interrupt_allowed(vcpu, for_injection); +} + +static void vt_enable_irq_window(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_irq_window(vcpu); +} + +static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = 0; + *error_code = 0; + + if (is_td_vcpu(vcpu)) + return; + + vmx_get_entry_info(vcpu, intr_info, error_code); +} + +static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + if (is_td_vcpu(vcpu)) { + tdx_get_exit_info(vcpu, reason, info1, info2, intr_info, + error_code); + return; + } + + vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); +} + +static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_cr8_intercept(vcpu, tpr, irr); +} + +static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_apic_access_page_addr(vcpu); +} + +static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm); + return; + } + + vmx_refresh_apicv_exec_ctrl(vcpu); +} + +static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + +static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_tss_addr(kvm, addr); +} + +static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_identity_map_addr(kvm, ident_addr); +} + +static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_offset(vcpu); +} + +static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_multiplier(vcpu); +} + +static void vt_write_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc offset can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_offset(vcpu); +} + +static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc multiplier can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_multiplier(vcpu); +} + +#ifdef CONFIG_X86_64 +static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) +{ + /* VMX-preemption timer isn't available for TDX. */ + if (is_td_vcpu(vcpu)) + return -EINVAL; + + return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired); +} + +static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + /* VMX-preemption timer can't be set. See vt_set_hv_timer(). 
*/ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_hv_timer(vcpu); +} +#endif + +static void vt_setup_mce(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_setup_mce(vcpu); +} + +static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) +{ + if (!is_td(kvm)) + return -ENOTTY; + + return tdx_vm_ioctl(kvm, argp); +} + +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_ioctl(vcpu, argp); +} + +static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + if (is_td(kvm)) + return tdx_gmem_private_max_mapping_level(kvm, pfn); + + return 0; +} + +#define vt_op(name) vt_##name +#define vt_op_tdx_only(name) vt_##name +#else /* CONFIG_KVM_INTEL_TDX */ +#define vt_op(name) vmx_##name +#define vt_op_tdx_only(name) NULL +#endif /* CONFIG_KVM_INTEL_TDX */ #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -24,110 +903,113 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, .enable_virtualization_cpu = vmx_enable_virtualization_cpu, - .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .disable_virtualization_cpu = vt_op(disable_virtualization_cpu), .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, - .has_emulated_msr = vmx_has_emulated_msr, + .has_emulated_msr = vt_op(has_emulated_msr), .vm_size = sizeof(struct kvm_vmx), - .vm_init = vmx_vm_init, - .vm_destroy = vmx_vm_destroy, - .vcpu_precreate = vmx_vcpu_precreate, - .vcpu_create = vmx_vcpu_create, - .vcpu_free = vmx_vcpu_free, - .vcpu_reset = vmx_vcpu_reset, + .vm_init = vt_op(vm_init), + .vm_destroy = vt_op(vm_destroy), + .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy), + + .vcpu_precreate = vt_op(vcpu_precreate), + .vcpu_create = vt_op(vcpu_create), + .vcpu_free = vt_op(vcpu_free), + .vcpu_reset = vt_op(vcpu_reset), - .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, + .prepare_switch_to_guest = vt_op(prepare_switch_to_guest), + .vcpu_load = vt_op(vcpu_load), + .vcpu_put = vt_op(vcpu_put), - .update_exception_bitmap = vmx_update_exception_bitmap, + .update_exception_bitmap = vt_op(update_exception_bitmap), .get_feature_msr = vmx_get_feature_msr, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cpl_no_cache = vmx_get_cpl_no_cache, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, - - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, - - .vcpu_pre_run = vmx_vcpu_pre_run, - .vcpu_run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, + .get_msr = vt_op(get_msr), + .set_msr = vt_op(set_msr), + + .get_segment_base = vt_op(get_segment_base), + .get_segment = vt_op(get_segment), + .set_segment = vt_op(set_segment), + .get_cpl = vt_op(get_cpl), 
+ .get_cpl_no_cache = vt_op(get_cpl_no_cache), + .get_cs_db_l_bits = vt_op(get_cs_db_l_bits), + .is_valid_cr0 = vt_op(is_valid_cr0), + .set_cr0 = vt_op(set_cr0), + .is_valid_cr4 = vt_op(is_valid_cr4), + .set_cr4 = vt_op(set_cr4), + .set_efer = vt_op(set_efer), + .get_idt = vt_op(get_idt), + .set_idt = vt_op(set_idt), + .get_gdt = vt_op(get_gdt), + .set_gdt = vt_op(set_gdt), + .set_dr6 = vt_op(set_dr6), + .set_dr7 = vt_op(set_dr7), + .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs), + .cache_reg = vt_op(cache_reg), + .get_rflags = vt_op(get_rflags), + .set_rflags = vt_op(set_rflags), + .get_if_flag = vt_op(get_if_flag), + + .flush_tlb_all = vt_op(flush_tlb_all), + .flush_tlb_current = vt_op(flush_tlb_current), + .flush_tlb_gva = vt_op(flush_tlb_gva), + .flush_tlb_guest = vt_op(flush_tlb_guest), + + .vcpu_pre_run = vt_op(vcpu_pre_run), + .vcpu_run = vt_op(vcpu_run), + .handle_exit = vt_op(handle_exit), .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .inject_irq = vmx_inject_irq, - .inject_nmi = vmx_inject_nmi, - .inject_exception = vmx_inject_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = vmx_enable_nmi_window, - .enable_irq_window = vmx_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, + .set_interrupt_shadow = vt_op(set_interrupt_shadow), + .get_interrupt_shadow = vt_op(get_interrupt_shadow), + .patch_hypercall = vt_op(patch_hypercall), + .inject_irq = vt_op(inject_irq), + .inject_nmi = vt_op(inject_nmi), + .inject_exception = vt_op(inject_exception), + .cancel_injection = vt_op(cancel_injection), + .interrupt_allowed = vt_op(interrupt_allowed), + .nmi_allowed = vt_op(nmi_allowed), + .get_nmi_mask = vt_op(get_nmi_mask), + .set_nmi_mask = vt_op(set_nmi_mask), + .enable_nmi_window = vt_op(enable_nmi_window), + .enable_irq_window = vt_op(enable_irq_window), + .update_cr8_intercept = vt_op(update_cr8_intercept), .x2apic_icr_is_split = false, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_pre_state_restore = vmx_apicv_pre_state_restore, + .set_virtual_apic_mode = vt_op(set_virtual_apic_mode), + .set_apic_access_page_addr = vt_op(set_apic_access_page_addr), + .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl), + .load_eoi_exitmap = vt_op(load_eoi_exitmap), + .apicv_pre_state_restore = pi_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_interrupt = vmx_deliver_interrupt, + .hwapic_isr_update = vt_op(hwapic_isr_update), + .sync_pir_to_irr = vt_op(sync_pir_to_irr), + .deliver_interrupt = vt_op(deliver_interrupt), .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, + .set_tss_addr = vt_op(set_tss_addr), + .set_identity_map_addr = vt_op(set_identity_map_addr), .get_mt_mask = vmx_get_mt_mask, - .get_exit_info = 
vmx_get_exit_info, + .get_exit_info = vt_op(get_exit_info), + .get_entry_info = vt_op(get_entry_info), - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid), .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .get_l2_tsc_offset = vmx_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_tsc_offset = vmx_write_tsc_offset, - .write_tsc_multiplier = vmx_write_tsc_multiplier, + .get_l2_tsc_offset = vt_op(get_l2_tsc_offset), + .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier), + .write_tsc_offset = vt_op(write_tsc_offset), + .write_tsc_multiplier = vt_op(write_tsc_multiplier), - .load_mmu_pgd = vmx_load_mmu_pgd, + .load_mmu_pgd = vt_op(load_mmu_pgd), .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .cpu_dirty_log_size = PML_ENTITY_NUM, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging), .nested_ops = &vmx_nested_ops, @@ -135,35 +1017,95 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, + .set_hv_timer = vt_op(set_hv_timer), + .cancel_hv_timer = vt_op(cancel_hv_timer), #endif - .setup_mce = vmx_setup_mce, + .setup_mce = vt_op(setup_mce), #ifdef CONFIG_KVM_SMM - .smi_allowed = vmx_smi_allowed, - .enter_smm = vmx_enter_smm, - .leave_smm = vmx_leave_smm, - .enable_smi_window = vmx_enable_smi_window, + .smi_allowed = vt_op(smi_allowed), + .enter_smm = vt_op(enter_smm), + .leave_smm = vt_op(leave_smm), + .enable_smi_window = vt_op(enable_smi_window), #endif - .check_emulate_instruction = vmx_check_emulate_instruction, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .check_emulate_instruction = vt_op(check_emulate_instruction), + .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), .migrate_timers = vmx_migrate_timers, - .msr_filter_changed = vmx_msr_filter_changed, - .complete_emulated_msr = kvm_complete_insn_gp, + .msr_filter_changed = vt_op(msr_filter_changed), + .complete_emulated_msr = vt_op(complete_emulated_msr), .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, .get_untagged_addr = vmx_get_untagged_addr, + + .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), + .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), + + .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) }; struct kvm_x86_init_ops vt_init_ops __initdata = { - .hardware_setup = vmx_hardware_setup, + .hardware_setup = vt_op(hardware_setup), .handle_intel_pt_intr = NULL, .runtime_ops = &vt_x86_ops, .pmu_ops = &intel_pmu_ops, }; + +static void __exit vt_exit(void) +{ + kvm_exit(); + tdx_cleanup(); + vmx_exit(); +} +module_exit(vt_exit); + +static int __init vt_init(void) +{ + unsigned vcpu_size, vcpu_align; + int r; + + r = vmx_init(); + if (r) + return r; + + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + + /* + * TDX and VMX have different vCPU structures. Calculate the + * maximum size/align so that kvm_init() can use the larger + * values to create the kmem_vcpu_cache. 
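+	 * A single cache backs both VM types, so it must be sized and
+	 * aligned for whichever of vcpu_vmx/vcpu_tdx is larger.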
+ */ + vcpu_size = sizeof(struct vcpu_vmx); + vcpu_align = __alignof__(struct vcpu_vmx); + if (enable_tdx) { + vcpu_size = max_t(unsigned, vcpu_size, + sizeof(struct vcpu_tdx)); + vcpu_align = max_t(unsigned, vcpu_align, + __alignof__(struct vcpu_tdx)); + kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM); + } + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: + vmx_exit(); + return r; +} +module_init(vt_init); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index aa78b6f38dfe..7211c71d4241 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6,6 +6,7 @@ #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include <asm/msr.h> #include "x86.h" #include "cpuid.h" @@ -257,7 +258,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem. */ - if (!guest_cpuid_has_evmcs(vcpu) || + if (!guest_cpu_cap_has_evmcs(vcpu) || !evmptr_is_valid(nested_get_evmptr(vcpu))) return false; @@ -275,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, { struct vmcs_host_state *dest, *src; - if (unlikely(!vmx->guest_state_loaded)) + if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; @@ -301,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); @@ -425,7 +426,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes. */ - WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); /* * PML Full and EPT Violation VM-Exits both use bit 12 to report @@ -824,12 +825,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; + /* + * Exceeding the limit results in architecturally _undefined_ behavior, + * i.e. KVM is allowed to do literally anything in response to a bad + * limit. Immediately generate a consistency check so that code that + * consumes the count doesn't need to worry about extreme edge cases. 
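+	 * E.g. if IA32_VMX_MISC[27:25] is 0, the recommended maximum is
+	 * (0 + 1) * 512 = 512 entries per list; counts beyond the limit
+	 * simply fail the VM-Entry consistency checks.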
+ */ + if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) + return -EINVAL; + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; @@ -940,15 +959,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } -static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; -} - /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. @@ -965,7 +975,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), @@ -1053,7 +1063,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) @@ -2089,7 +2099,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( bool evmcs_gpa_changed = false; u64 evmcs_gpa; - if (likely(!guest_cpuid_has_evmcs(vcpu))) + if (likely(!guest_cpu_cap_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; evmcs_gpa = nested_get_evmptr(vcpu); @@ -2970,7 +2980,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if (CC(vmcs12->vm_entry_instruction_len > 15) || + if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; @@ -2992,7 +3002,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_KVM_HYPERV - if (guest_cpuid_has_evmcs(vcpu)) + if (guest_cpu_cap_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); #endif @@ -3287,7 +3297,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. 
*/ - if (guest_cpuid_has_evmcs(vcpu) && + if (guest_cpu_cap_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -3442,7 +3452,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) if (!nested_cpu_has_pml(vmcs12)) return 0; - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { vmx->nested.pml_full = true; return 1; } @@ -3481,14 +3491,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } -static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) -{ - u8 rvi = vmx_get_rvi(); - u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); @@ -3508,7 +3510,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; - bool evaluate_pending_interrupts; union vmx_exit_reason exit_reason = { .basic = EXIT_REASON_INVALID_STATE, .failed_vmentry = 1, @@ -3527,13 +3528,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, kvm_service_local_tlb_flush_requests(vcpu); - evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); - if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) - evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!evaluate_pending_interrupts) - evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); - if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); @@ -3616,9 +3610,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit - * unconditionally. + * unconditionally. Take care to pull data from vmcs01 as appropriate, + * e.g. when checking for interrupt windows, as vmcs02 is now loaded. */ - if (unlikely(evaluate_pending_interrupts)) + if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING)) || + kvm_apic_has_pending_init_or_sipi(vcpu) || + kvm_apic_has_interrupt(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); /* @@ -3751,14 +3749,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; - /* Emulate processing of posted interrupts on VM-Enter. */ - if (nested_cpu_has_posted_intr(vmcs12) && - kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); - } - /* Hide L1D cache contents from the nested guest. 
*/ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3791,7 +3781,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; @@ -4220,13 +4210,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) */ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* - * New events (not exceptions) are only recognized at instruction + * Events that don't require injection, i.e. that are virtualized by + * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need + * to regain control in order to deliver the event, and hardware will + * handle event ordering, e.g. with respect to injected exceptions. + * + * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a - * VM-Exit that occurred _during_ instruction execution; new events are - * blocked until the instruction completes. + * VM-Exit that occurred _during_ instruction execution; new events, + * irrespective of whether or not they're injected, are blocked until + * the instruction completes. + */ + bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); + /* + * Inject events are blocked by nested VM-Enter, as KVM is responsible + * for managing priority between concurrent events, i.e. KVM needs to + * wait until after VM-Enter completes to deliver injected events. */ bool block_nested_events = block_nested_exceptions || - kvm_event_needs_reinjection(vcpu); + block_non_injected_events; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -4338,18 +4340,26 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { int irq; - if (block_nested_events) - return -EBUSY; - if (!nested_exit_on_intr(vcpu)) + if (!nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + goto no_vmexit; + } if (!nested_exit_intr_ack_set(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0; } irq = kvm_cpu_get_extint(vcpu); if (irq != -1) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); return 0; @@ -4368,11 +4378,22 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) * and enabling posted interrupts requires ACK-on-exit. */ if (irq == vmx->nested.posted_intr_nv) { + /* + * Nested posted interrupts are delivered via RVI, i.e. + * aren't injected by KVM, and so can be queued even if + * manual event injection is disallowed. 
+ */ + if (block_non_injected_events) + return -EBUSY; + vmx->nested.pi_pending = true; kvm_apic_clear_irr(vcpu, irq); goto no_vmexit; } + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); @@ -4509,12 +4530,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } @@ -4607,11 +4628,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; - if (to_vmx(vcpu)->exit_reason.enclave_mode) + if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; @@ -4635,7 +4656,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* @@ -4783,7 +4804,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -4919,8 +4940,9 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification) +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -4970,7 +4992,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, - exit_intr_info, exit_qualification); + exit_intr_info, exit_qualification, + exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -5007,16 +5030,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - /* - * If IBRS is advertised to the vCPU, KVM must flush the indirect - * branch predictors when transitioning from L2 to L1, as L1 expects - * hardware (KVM in this case) to provide separate predictor modes. - * Bare metal isolates VMX root (host) from VMX non-root (guest), but - * doesn't isolate different VMCSs, i.e. in this case, doesn't provide - * separate modes for L2 vs L1. 
- */ - if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - indirect_branch_prediction_barrier(); + kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -5050,12 +5064,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } + if (vmx->nested.update_vmcs01_hwapic_isr) { + vmx->nested.update_vmcs01_hwapic_isr = false; + kvm_apic_update_hwapic_isr(vcpu); + } + if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { if (vm_exit_reason != -1) @@ -5068,6 +5087,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } @@ -5300,9 +5330,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); @@ -6099,7 +6128,7 @@ fail: * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. 
*/ - nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -6279,7 +6308,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, { u32 encls_leaf; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) return false; @@ -6544,7 +6573,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; @@ -6617,7 +6646,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (guest_can_use(vcpu, X86_FEATURE_VMX) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6758,7 +6787,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6792,7 +6821,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!guest_can_use(vcpu, X86_FEATURE_VMX) || + (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; @@ -7174,8 +7203,8 @@ static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. 
*/ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 2c296b6abb8c..6eedcfc91070 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification); +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len); + +static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) +{ + u32 exit_insn_len; + + if (to_vmx(vcpu)->fail || vm_exit_reason == -1 || + (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_insn_len = 0; + else + exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info, + exit_qualification, exit_insn_len); +} + void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9c9d4a336166..bbf4509f32d0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -13,12 +13,14 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" #include "nested.h" #include "pmu.h" +#include "tdx.h" /* * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX @@ -34,6 +36,24 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) +static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc; +} + +static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc.records; +} + +#pragma GCC poison to_vmx + static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; @@ -110,7 +130,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return 0; return vcpu->arch.perf_capabilities; @@ -129,6 +149,22 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } +static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return cpuid_model_is_consistent(vcpu); +} + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return !!vcpu_to_lbr_records(vcpu)->nr; +} + static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) { struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); @@ -160,7 +196,7 @@ 
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: perf_capabilities = vcpu_get_perf_capabilities(vcpu); @@ -194,6 +230,9 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) { struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + if (lbr_desc->event) { perf_event_release_kernel(lbr_desc->event); lbr_desc->event = NULL; @@ -235,6 +274,9 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) PERF_SAMPLE_BRANCH_USER, }; + if (WARN_ON_ONCE(!lbr_desc)) + return 0; + if (unlikely(lbr_desc->event)) { __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); return 0; @@ -279,9 +321,9 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, local_irq_disable(); if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { if (read) - rdmsrl(index, msr_info->data); + rdmsrq(index, msr_info->data); else - wrmsrl(index, msr_info->data); + wrmsrq(index, msr_info->data); __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); return true; @@ -466,6 +508,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) u64 perf_capabilities; u64 counter_rsvd; + if (!lbr_desc) + return; + memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); /* @@ -542,7 +587,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (cpuid_model_is_consistent(vcpu) && + if (intel_pmu_lbr_is_compatible(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else @@ -570,6 +615,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; @@ -677,6 +725,9 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (WARN_ON_ONCE(!lbr_desc)) + return; + if (!lbr_desc->event) { vmx_disable_lbr_msrs_passthrough(vcpu); if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h new file mode 100644 index 000000000000..5620d0882cdc --- /dev/null +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_PMU_INTEL_H +#define __KVM_X86_VMX_PMU_INTEL_H + +#include <linux/kvm_host.h> + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. 
+ */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + +extern struct x86_pmu_lbr vmx_lbr_caps; + +#endif /* __KVM_X86_VMX_PMU_INTEL_H */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43..5c615e5845bf 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -11,6 +11,7 @@ #include "posted_intr.h" #include "trace.h" #include "vmx.h" +#include "tdx.h" /* * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() @@ -31,9 +32,11 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { - return &(to_vmx(vcpu)->pi_desc); + return &(to_vt(vcpu)->pi_desc); } static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) @@ -53,7 +56,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -89,9 +92,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler. + */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); + list_del(&vt->pi_wakeup_list); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -134,9 +148,8 @@ after_clear_sn: static bool vmx_can_use_vtd_pi(struct kvm *kvm) { - return irqchip_in_kernel(kvm) && enable_apicv && - kvm_arch_has_assigned_device(kvm) && - irq_remapping_cap(IRQ_POSTING_CAP); + return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && + kvm_arch_has_assigned_device(kvm); } /* @@ -146,14 +159,26 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vmx->pi_wakeup_list, + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. 
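+	 * Passing PI_LOCK_SCHED_OUT to raw_spin_lock_nested() below gives
+	 * this acquisition a distinct lockdep subclass so the false
+	 * positive isn't reported.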
+ * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. + */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); + list_add_tail(&vt->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) @@ -190,7 +213,8 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) * notification vector is switched to the one that calls * back to the pi_wakeup_handler() function. */ - return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); + return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) || + vmx_can_use_vtd_pi(vcpu->kvm); } void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -200,7 +224,9 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + if (kvm_vcpu_is_blocking(vcpu) && + ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || + (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); /* @@ -220,13 +246,13 @@ void pi_wakeup_handler(void) int cpu = smp_processor_id(); struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); - struct vcpu_vmx *vmx; + struct vcpu_vt *vt; raw_spin_lock(spinlock); - list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { + list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { - if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_wake_up(&vmx->vcpu); + if (pi_test_on(&vt->pi_desc)) + kvm_vcpu_wake_up(vt_to_vcpu(vt)); } raw_spin_unlock(spinlock); } @@ -237,6 +263,14 @@ void __init pi_init_cpu(int cpu) raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi = vcpu_to_pi_desc(vcpu); + + pi_clear_on(pi); + memset(pi->pir, 0, sizeof(pi->pir)); +} + bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -254,7 +288,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) */ void vmx_pi_start_assignment(struct kvm *kvm) { - if (!irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_irq_bypass()) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); @@ -274,6 +308,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -312,21 +347,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. 
- */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -334,11 +356,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + + enable_remapped_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -346,6 +369,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index ad9116a99bcc..80499ea0e674 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -9,6 +9,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); @@ -18,7 +19,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { int vec; - vec = find_last_bit((unsigned long *)pi_desc->pir, 256); + vec = find_last_bit(pi_desc->pir, 256); return vec < 256 ? vec : -1; } diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b352a3ba7354..df1d0cf76947 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -2,6 +2,7 @@ /* Copyright(c) 2021 Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/msr.h> #include <asm/sgx.h> #include "x86.h" @@ -122,7 +123,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) * likely than a bad userspace address. */ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && - guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) { memset(&ex, 0, sizeof(ex)); ex.vector = PF_VECTOR; ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | @@ -365,7 +366,7 @@ static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) return true; if (leaf >= EAUG && leaf <= EMODT) - return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2); return false; } @@ -381,8 +382,8 @@ int handle_encls(struct kvm_vcpu *vcpu) { u32 leaf = (u32)kvm_rax_read(vcpu); - if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) || - !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || + !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) { kvm_queue_exception(vcpu, UD_VECTOR); } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) || !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { @@ -411,16 +412,16 @@ void setup_default_sgx_lepubkeyhash(void) * MSRs exist but are read-only (locked and not writable). 
*/ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || - rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; } else { /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); } } @@ -479,15 +480,15 @@ void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!cpu_has_vmx_encls_vmexit()) return; - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) && sgx_enabled_in_guest_bios(vcpu)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) { bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); if (sgx_intercept_encls_ecreate(vcpu)) bitmap |= (1 << ECREATE); } - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) bitmap &= ~GENMASK_ULL(EMODT, EAUG); /* diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..1ad20c273f3b --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,3595 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cleanup.h> +#include <linux/cpu.h> +#include <asm/cpufeature.h> +#include <asm/fpu/xcr.h> +#include <linux/misc_cgroup.h> +#include <linux/mmu_context.h> +#include <asm/tdx.h> +#include "capabilities.h" +#include "mmu.h" +#include "x86_ops.h" +#include "lapic.h" +#include "tdx.h" +#include "vmx.h" +#include "mmu/spte.h" +#include "common.h" +#include "posted_intr.h" +#include "irq.h" +#include <trace/events/kvm.h> +#include "trace.h" + +#pragma GCC poison to_vmx + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define pr_tdx_error(__fn, __err) \ + pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) + +#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) 
\ + pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) + +#define pr_tdx_error_1(__fn, __err, __rcx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) + +#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) + +#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) + +bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) +#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) + +static enum cpuhp_state tdx_cpuhp_state; + +static const struct tdx_sys_info *tdx_sysinfo; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); +} + +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); +} + +#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) + +static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_tdx, kvm); +} + +static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_tdx, vcpu); +} + +static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = KVM_SUPPORTED_TD_ATTRS; + + if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) + return 0; + + val &= td_conf->attributes_fixed0; + + return val; +} + +static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; + + if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) + return 0; + + val &= td_conf->xfam_fixed0; + + return val; +} + +static int tdx_get_guest_phys_addr_bits(const u32 eax) +{ + return (eax & GENMASK(23, 16)) >> 16; +} + +static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) +{ + return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; +} + +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) + +static bool has_tsx(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ebx & TDX_FEATURE_TSX); +} + +static void clear_tsx(struct kvm_cpuid_entry2 *entry) +{ + entry->ebx &= ~TDX_FEATURE_TSX; +} + +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); +} + +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) +{ + entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); +} + +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) +{ + if (has_tsx(entry)) + clear_tsx(entry); + + if (has_waitpkg(entry)) + clear_waitpkg(entry); +} + +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) +{ + return has_tsx(entry) || has_waitpkg(entry); +} + +#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) + +static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + + entry->function = (u32)td_conf->cpuid_config_leaves[idx]; + entry->index = td_conf->cpuid_config_leaves[idx] >> 
32; + entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; + entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; + entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; + entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; + + if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) + entry->index = 0; + + /* + * The TDX module doesn't allow configuring the guest phys addr bits + * (EAX[23:16]). However, KVM uses it as an interface to the userspace + * to configure the GPAW. Report these bits as configurable. + */ + if (entry->function == 0x80000008) + entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); + + tdx_clear_unsupported_cpuid(entry); +} + +static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, + struct kvm_tdx_capabilities *caps) +{ + int i; + + caps->supported_attrs = tdx_get_supported_attrs(td_conf); + if (!caps->supported_attrs) + return -EIO; + + caps->supported_xfam = tdx_get_supported_xfam(td_conf); + if (!caps->supported_xfam) + return -EIO; + + caps->cpuid.nent = td_conf->num_cpuid_config; + + for (i = 0; i < td_conf->num_cpuid_config; i++) + td_init_cpuid_entry2(&caps->cpuid.entries[i], i); + + return 0; +} + +/* + * Some SEAMCALLs acquire the TDX module globally, and can fail with + * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. + */ +static DEFINE_MUTEX(tdx_lock); + +static atomic_t nr_configured_hkid; + +static bool tdx_operand_busy(u64 err) +{ + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; +} + + +/* + * A per-CPU list of TD vCPUs associated with a given CPU. + * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU + * list. + * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of + * the old CPU during the IPI callback running on the old CPU, and then added + * to the per-CPU list of the new CPU. + * - When a TD is tearing down, all vCPUs are disassociated from their current + * running CPUs and removed from the per-CPU list during the IPI callback + * running on those CPUs. + * - When a CPU is brought down, traverse the per-CPU list to disassociate all + * associated TD vCPUs and remove them from the per-CPU list. + */ +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); + +static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r10; +} + +static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r11; +} + +static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, + long val) +{ + to_tdx(vcpu)->vp_enter_args.r10 = val; +} + +static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, + unsigned long val) +{ + to_tdx(vcpu)->vp_enter_args.r11 = val; +} + +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) +{ + tdx_guest_keyid_free(kvm_tdx->hkid); + kvm_tdx->hkid = -1; + atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; +} + +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) +{ + return kvm_tdx->hkid > 0; +} + +static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) +{ + lockdep_assert_irqs_disabled(); + + list_del(&to_tdx(vcpu)->cpu_list); + + /* + * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, + * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU + * to its list before it's deleted from this CPU's list. 
+ */ + smp_wmb(); + + vcpu->cpu = -1; +} + +static void tdx_clear_page(struct page *page) +{ + const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); + void *dest = page_to_virt(page); + unsigned long i; + + /* + * The page could have been poisoned. MOVDIR64B also clears + * the poison bit so the kernel can safely use the page again. + */ + for (i = 0; i < PAGE_SIZE; i += 64) + movdir64b(dest + i, zero_page); + /* + * MOVDIR64B store uses WC buffer. Prevent following memory reads + * from seeing potentially poisoned cache. + */ + __mb(); +} + +static void tdx_no_vcpus_enter_start(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +static void tdx_no_vcpus_enter_stop(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); +} + +/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ +static int __tdx_reclaim_page(struct page *page) +{ + u64 err, rcx, rdx, r8; + + err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); + + /* + * No need to check for TDX_OPERAND_BUSY; all TD pages are freed + * before the HKID is released and control pages have also been + * released at this point, so there is no possibility of contention. + */ + if (WARN_ON_ONCE(err)) { + pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + return -EIO; + } + return 0; +} + +static int tdx_reclaim_page(struct page *page) +{ + int r; + + r = __tdx_reclaim_page(page); + if (!r) + tdx_clear_page(page); + return r; +} + + +/* + * Reclaim the TD control page(s) which are crypto-protected by TDX guest's + * private KeyID. Assume the cache associated with the TDX private KeyID has + * been flushed. + */ +static void tdx_reclaim_control_page(struct page *ctrl_page) +{ + /* + * Leak the page if the kernel failed to reclaim the page. + * The kernel cannot use it safely anymore. + */ + if (tdx_reclaim_page(ctrl_page)) + return; + + __free_page(ctrl_page); +} + +struct tdx_flush_vp_arg { + struct kvm_vcpu *vcpu; + u64 err; +}; + +static void tdx_flush_vp(void *_arg) +{ + struct tdx_flush_vp_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + u64 err; + + arg->err = 0; + lockdep_assert_irqs_disabled(); + + /* Task migration can race with CPU offlining. */ + if (unlikely(vcpu->cpu != raw_smp_processor_id())) + return; + + /* + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The + * list tracking still needs to be updated so that it's correct if/when + * the vCPU does get initialized. + */ + if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { + /* + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This + * vp flush function is called when destructing vCPU/TD or vCPU + * migration. No other thread uses TDVPR in those cases. + */ + err = tdh_vp_flush(&to_tdx(vcpu)->vp); + if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { + /* + * This function is called in IPI context. Do not use + * printk to avoid console semaphore. + * The caller prints out the error message, instead. 
+ */ + if (err) + arg->err = err; + } + } + + tdx_disassociate_vp(vcpu); +} + +static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) +{ + struct tdx_flush_vp_arg arg = { + .vcpu = vcpu, + }; + int cpu = vcpu->cpu; + + if (unlikely(cpu == -1)) + return; + + smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); + if (KVM_BUG_ON(arg.err, vcpu->kvm)) + pr_tdx_error(TDH_VP_FLUSH, arg.err); +} + +void tdx_disable_virtualization_cpu(void) +{ + int cpu = raw_smp_processor_id(); + struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); + struct tdx_flush_vp_arg arg; + struct vcpu_tdx *tdx, *tmp; + unsigned long flags; + + local_irq_save(flags); + /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ + list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { + arg.vcpu = &tdx->vcpu; + tdx_flush_vp(&arg); + } + local_irq_restore(flags); +} + +#define TDX_SEAMCALL_RETRIES 10000 + +static void smp_func_do_phymem_cache_wb(void *unused) +{ + u64 err = 0; + bool resume; + int i; + + /* + * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private + * KeyID on the package or core. The TDX module may not finish the + * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The + * kernel should retry it until it returns success w/o rescheduling. + */ + for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { + resume = !!err; + err = tdh_phymem_cache_wb(resume); + switch (err) { + case TDX_INTERRUPTED_RESUMABLE: + continue; + case TDX_NO_HKID_READY_TO_WBCACHE: + err = TDX_SUCCESS; /* Already done by other thread */ + fallthrough; + default: + goto out; + } + } + +out: + if (WARN_ON_ONCE(err)) + pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); +} + +void tdx_mmu_release_hkid(struct kvm *kvm) +{ + bool packages_allocated, targets_allocated; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages, targets; + struct kvm_vcpu *vcpu; + unsigned long j; + int i; + u64 err; + + if (!is_hkid_assigned(kvm_tdx)) + return; + + packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); + targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); + cpus_read_lock(); + + kvm_for_each_vcpu(j, vcpu, kvm) + tdx_flush_vp_on_cpu(vcpu); + + /* + * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock + * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. + * Multiple TDX guests can be destroyed simultaneously. Take the + * mutex to prevent it from getting error. + */ + mutex_lock(&tdx_lock); + + /* + * Releasing HKID is in vm_destroy(). + * After the above flushing vps, there should be no more vCPU + * associations, as all vCPU fds have been released at this stage. + */ + err = tdh_mng_vpflushdone(&kvm_tdx->td); + if (err == TDX_FLUSHVP_NOT_DONE) + goto out; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + goto out; + } + + for_each_online_cpu(i) { + if (packages_allocated && + cpumask_test_and_set_cpu(topology_physical_package_id(i), + packages)) + continue; + if (targets_allocated) + cpumask_set_cpu(i, targets); + } + if (targets_allocated) + on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); + else + on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); + /* + * In the case of error in smp_func_do_phymem_cache_wb(), the following + * tdh_mng_key_freeid() will fail. + */ + err = tdh_mng_key_freeid(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_KEY_FREEID, err); + pr_err("tdh_mng_key_freeid() failed. 
HKID %d is leaked.\n", + kvm_tdx->hkid); + } else { + tdx_hkid_free(kvm_tdx); + } + +out: + mutex_unlock(&tdx_lock); + cpus_read_unlock(); + free_cpumask_var(targets); + free_cpumask_var(packages); +} + +static void tdx_reclaim_td_control_pages(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + int i; + + /* + * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong + * heavily with TDX module. Give up freeing TD pages. As the function + * already warned, don't warn it again. + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (kvm_tdx->td.tdcs_pages) { + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (!kvm_tdx->td.tdcs_pages[i]) + continue; + + tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); + } + kfree(kvm_tdx->td.tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + } + + if (!kvm_tdx->td.tdr_page) + return; + + if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) + return; + + /* + * Use a SEAMCALL to ask the TDX module to flush the cache based on the + * KeyID. TDX module may access TDR while operating on TD (Especially + * when it is reclaiming TDCS). + */ + err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return; + } + tdx_clear_page(kvm_tdx->td.tdr_page); + + __free_page(kvm_tdx->td.tdr_page); + kvm_tdx->td.tdr_page = NULL; +} + +void tdx_vm_destroy(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + tdx_reclaim_td_control_pages(kvm); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; +} + +static int tdx_do_tdh_mng_key_config(void *param) +{ + struct kvm_tdx *kvm_tdx = param; + u64 err; + + /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ + err = tdh_mng_key_config(&kvm_tdx->td); + + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { + pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + return -EIO; + } + + return 0; +} + +int tdx_vm_init(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + kvm->arch.has_protected_state = true; + kvm->arch.has_private_mem = true; + kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + /* + * Because guest TD is protected, VMM can't parse the instruction in TD. + * Instead, guest uses MMIO hypercall. For unmodified device driver, + * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO + * instruction into MMIO hypercall. + * + * SPTE value for MMIO needs to be setup so that #VE is injected into + * TD instead of triggering EPT MISCONFIG. + * - RWX=0 so that EPT violation is triggered. + * - suppress #VE bit is cleared to inject #VE. + */ + kvm_mmu_set_mmio_spte_value(kvm, 0); + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports + * such limit via the MAX_VCPU_PER_TD global metadata. In + * practice, it reflects the number of logical CPUs that ALL + * platforms that the TDX module supports can possibly have. + * + * Limit TDX guest's maximum vCPUs to the number of logical CPUs + * the platform has. Simply forwarding the MAX_VCPU_PER_TD to + * userspace would result in an unpredictable ABI. + */ + kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; + + return 0; +} + +int tdx_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (kvm_tdx->state != TD_STATE_INITIALIZED) + return -EIO; + + /* + * TDX module mandates APICv, which requires an in-kernel local APIC. 
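
The HKID release and key-configuration paths above only need to run their package-scoped SEAMCALLs (TDH.PHYMEM.CACHE.WB, and later TDH.MNG.KEY.CONFIG) on one CPU per physical package; the cpumask juggling implements "pick the first online CPU of each package". A userspace-style sketch of that selection, with a hypothetical cpu-to-package table in place of topology_physical_package_id():

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH	8
#define NR_PKGS_SKETCH	4

/* Hypothetical topology: cpu -> physical package id, plus online state. */
static const int cpu_to_pkg[NR_CPUS_SKETCH] = { 0, 0, 1, 1, 2, 2, 3, 3 };
static const bool cpu_online[NR_CPUS_SKETCH] = {
	true, true, false, true, true, true, true, true
};

int main(void)
{
	bool pkg_seen[NR_PKGS_SKETCH] = { false };
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_SKETCH; cpu++) {
		if (!cpu_online[cpu])
			continue;
		if (pkg_seen[cpu_to_pkg[cpu]])
			continue;	/* package already has a target CPU */
		pkg_seen[cpu_to_pkg[cpu]] = true;
		printf("run package-scoped SEAMCALL on cpu %d (package %d)\n",
		       cpu, cpu_to_pkg[cpu]);
	}
	return 0;
}
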
+ * Disallow an in-kernel I/O APIC, because level-triggered interrupts + * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. + */ + if (!irqchip_split(vcpu->kvm)) + return -EINVAL; + + fpstate_set_confidential(&vcpu->arch.guest_fpu); + vcpu->arch.apic->guest_apic_protected = true; + INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); + + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; + + vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; + vcpu->arch.cr0_guest_owned_bits = -1ul; + vcpu->arch.cr4_guest_owned_bits = -1ul; + + /* KVM can't change TSC offset/multiplier as TDX module manages them. */ + vcpu->arch.guest_tsc_protected = true; + vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; + vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + + vcpu->arch.guest_state_protected = + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); + + if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) + vcpu->arch.xfd_no_write_intercept = true; + + tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&tdx->vt.pi_desc); + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; + + return 0; +} + +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + vmx_vcpu_pi_load(vcpu, cpu); + if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) + return; + + tdx_flush_vp_on_cpu(vcpu); + + KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); + local_irq_disable(); + /* + * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure + * vcpu->cpu is read before tdx->cpu_list. + */ + smp_rmb(); + + list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); + local_irq_enable(); +} + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get the interrupt status of TDX guest and it assumes + * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, + * which passes the interrupt blocked flag. + */ + return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + !to_tdx(vcpu)->vp_enter_args.r12; +} + +bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + u64 vcpu_state_details; + + if (pi_has_pending_interrupt(vcpu)) + return true; + + /* + * Only check RVI pending for HALTED case with IRQ enabled. + * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the + * interrupt was pending before TD exit, then it _must_ be blocked, + * otherwise the interrupt would have been serviced at the instruction + * boundary. + */ + if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + to_tdx(vcpu)->vp_enter_args.r12) + return false; + + vcpu_state_details = + td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); + + return tdx_vcpu_state_details_intr_pending(vcpu_state_details); +} + +/* + * Compared to vmx_prepare_switch_to_guest(), there is not much to do + * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
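
tdx_disassociate_vp() and tdx_vcpu_load() above form a publish/consume pair: the old CPU unlinks the vCPU from its list and only then stores vcpu->cpu = -1 behind smp_wmb(), while the new CPU checks vcpu->cpu first and only touches the list after smp_rmb(). The following compile-able C11 sketch shows just the fence placement; it is single-threaded and uses hypothetical globals in place of the vCPU fields, so it is a shape illustration rather than the kernel's actual synchronization.

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-ins for vcpu->cpu and the per-CPU list membership. */
static _Atomic int vcpu_cpu = 3;
static int on_old_cpu_list = 1;

/* Old CPU (IPI callback): unlink first, then publish vcpu->cpu = -1. */
static void disassociate_vp_sketch(void)
{
	on_old_cpu_list = 0;				/* list_del()     */
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	atomic_store_explicit(&vcpu_cpu, -1, memory_order_relaxed);
}

/* New CPU: add to its own list only after observing vcpu->cpu == -1. */
static void vcpu_load_sketch(int new_cpu)
{
	if (atomic_load_explicit(&vcpu_cpu, memory_order_relaxed) != -1)
		return;					/* still associated elsewhere */
	atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
	/* The old CPU's list_del() is guaranteed to be visible here. */
	printf("old list membership now %d; adding to cpu %d list\n",
	       on_old_cpu_list, new_cpu);
	atomic_store_explicit(&vcpu_cpu, new_cpu, memory_order_relaxed);
}

int main(void)
{
	disassociate_vp_sketch();
	vcpu_load_sketch(1);
	return 0;
}
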
+ */ +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->host_debugctlmsr = get_debugctlmsr(); + + vt->guest_state_loaded = true; +} + +struct tdx_uret_msr { + u32 msr; + unsigned int slot; + u64 defval; +}; + +static struct tdx_uret_msr tdx_uret_msrs[] = { + {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, + {.msr = MSR_STAR,}, + {.msr = MSR_LSTAR,}, + {.msr = MSR_TSC_AUX,}, +}; + +static void tdx_user_return_msr_update_cache(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) + kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval); +} + +static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (!vt->guest_state_loaded) + return; + + ++vcpu->stat.host_state_reload; + wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); + + if (tdx->guest_entered) { + tdx_user_return_msr_update_cache(); + tdx->guest_entered = false; + } + + vt->guest_state_loaded = false; +} + +void tdx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + tdx_prepare_switch_to_host(vcpu); +} + +void tdx_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + int i; + + /* + * It is not possible to reclaim pages while hkid is assigned. It might + * be assigned if: + * 1. the TD VM is being destroyed but freeing hkid failed, in which + * case the pages are leaked + * 2. TD VCPU creation failed and this on the error path, in which case + * there is nothing to do anyway + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (tdx->vp.tdcx_pages) { + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + } + if (tdx->vp.tdvpr_page) { + tdx_reclaim_control_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + } + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; +} + +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || + to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) + return -EINVAL; + + return 1; +} + +static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case EXIT_REASON_CPUID: + case EXIT_REASON_HLT: + case EXIT_REASON_IO_INSTRUCTION: + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return tdvmcall_leaf(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return EXIT_REASON_EPT_MISCONFIG; + default: + break; + } + + return EXIT_REASON_TDCALL; +} + +static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u32 exit_reason; + + switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { + case TDX_SUCCESS: + case TDX_NON_RECOVERABLE_VCPU: + case TDX_NON_RECOVERABLE_TD: + case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: + case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: + break; + default: + return -1u; + } + + exit_reason = tdx->vp_enter_ret; + + switch (exit_reason) { + case EXIT_REASON_TDCALL: + if (tdvmcall_exit_type(vcpu)) + return EXIT_REASON_VMCALL; + + return tdcall_to_vmx_exit_reason(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + /* + * Defer 
KVM_BUG_ON() until tdx_handle_exit() because this is in + * non-instrumentable code with interrupts disabled. + */ + return -1u; + default: + break; + } + + return exit_reason; +} + +static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); + + guest_state_enter_irqoff(); + + tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args); + + vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu); + + vt->exit_qualification = tdx->vp_enter_args.rcx; + tdx->ext_exit_qualification = tdx->vp_enter_args.rdx; + tdx->exit_gpa = tdx->vp_enter_args.r8; + vt->exit_intr_info = tdx->vp_enter_args.r9; + + vmx_handle_nmi(vcpu); + + guest_state_exit_irqoff(); +} + +static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu) +{ + return vmx_get_exit_reason(vcpu).failed_vmentry && + vmx_get_exit_reason(vcpu).full != -1u; +} + +static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret; + + /* + * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation + * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. + * + * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both + * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target + * vCPUs leaving fastpath so that interrupt can be enabled to ensure the + * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of + * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the + * requester may be blocked endlessly. + */ + if (unlikely(tdx_operand_busy(vp_enter_ret))) + return EXIT_FASTPATH_EXIT_HANDLED; + + return EXIT_FASTPATH_NONE; +} + +#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \ + BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \ + BIT_ULL(VCPU_REGS_RAX) | \ + BIT_ULL(VCPU_REGS_RBX) | \ + BIT_ULL(VCPU_REGS_RCX) | \ + BIT_ULL(VCPU_REGS_RDX) | \ + BIT_ULL(VCPU_REGS_RBP) | \ + BIT_ULL(VCPU_REGS_RSI) | \ + BIT_ULL(VCPU_REGS_RDI) | \ + BIT_ULL(VCPU_REGS_R8) | \ + BIT_ULL(VCPU_REGS_R9) | \ + BIT_ULL(VCPU_REGS_R10) | \ + BIT_ULL(VCPU_REGS_R11) | \ + BIT_ULL(VCPU_REGS_R12) | \ + BIT_ULL(VCPU_REGS_R13) | \ + BIT_ULL(VCPU_REGS_R14) | \ + BIT_ULL(VCPU_REGS_R15)) + +static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + + /* + * All TDX hosts support PKRU; but even if they didn't, + * vcpu->arch.host_pkru would be 0 and the wrpkru would be + * skipped. + */ + if (vcpu->arch.host_pkru != 0) + wrpkru(vcpu->arch.host_pkru); + + if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); + + /* + * Likewise, even if a TDX hosts didn't support XSS both arms of + * the comparison would be 0 and the wrmsrl would be skipped. + */ + if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) + wrmsrl(MSR_IA32_XSS, kvm_host.xss); +} + +#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \ + DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ + DEBUGCTLMSR_FREEZE_IN_SMM) + +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); + + /* + * force_immediate_exit requires vCPU entering for events injection with + * an immediately exit followed. But The TDX module doesn't guarantee + * entry, it's already possible for KVM to _think_ it completely entry + * to the guest without actually having done so. 
+ * Since KVM never needs to force an immediate exit for TDX, and can't + * do direct injection, just warn on force_immediate_exit. + */ + WARN_ON_ONCE(force_immediate_exit); + + /* + * Wait until retry of SEPT-zap-related SEAMCALL completes before + * allowing vCPU entry to avoid contention with tdh_vp_enter() and + * TDCALLs. + */ + if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) + return EXIT_FASTPATH_EXIT_HANDLED; + + trace_kvm_entry(vcpu, force_immediate_exit); + + if (pi_test_on(&vt->pi_desc)) { + apic->send_IPI_self(POSTED_INTR_VECTOR); + + if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & + APIC_VECTOR_MASK, &vt->pi_desc)) + kvm_wait_lapic_expire(vcpu); + } + + tdx_vcpu_enter_exit(vcpu); + + if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) + update_debugctlmsr(vt->host_debugctlmsr); + + tdx_load_host_xsave_state(vcpu); + tdx->guest_entered = true; + + vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; + + if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) + return EXIT_FASTPATH_NONE; + + if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) + return EXIT_FASTPATH_NONE; + + if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) + kvm_machine_check(); + + trace_kvm_exit(vcpu, KVM_ISA_VMX); + + if (unlikely(tdx_failed_vmentry(vcpu))) + return EXIT_FASTPATH_NONE; + + return tdx_exit_handlers_fastpath(vcpu); +} + +void tdx_inject_nmi(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.nmi_injections; + td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); + /* + * From KVM's perspective, NMI injection is completed right after + * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by + * the TDX module or not. + */ + vcpu->arch.nmi_injected = false; + /* + * TDX doesn't support KVM to request NMI window exit. If there is + * still a pending vNMI, KVM is not able to inject it along with the + * one pending in TDX module in a back-to-back way. Since the previous + * vNMI is still pending in TDX module, i.e. it has not been delivered + * to TDX guest yet, it's OK to collapse the pending vNMI into the + * previous one. The guest is expected to handle all the NMI sources + * when handling the first vNMI. + */ + vcpu->arch.nmi_pending = 0; +} + +static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) +{ + u32 intr_info = vmx_get_intr_info(vcpu); + + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on + * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). + */ + if (is_nmi(intr_info) || is_machine_check(intr_info)) + return 1; + + vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; + vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + vcpu->run->ex.error_code = 0; + + return 0; +} + +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); + return 1; +} + +static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) +{ + kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); + kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); + kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); + kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); + kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); + + return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); +} + +/* + * Split into chunks and check interrupt pending between chunks. This allows + * for timely injection of interrupts to prevent issues with guest lockup + * detection. 
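
tdx_emulate_vmcall() above bridges the TDVMCALL convention to KVM's generic hypercall handler by moving R10-R14 into the RAX/RBX/RCX/RDX/RSI slots that __kvm_emulate_hypercall() consumes. A small table-driven sketch of that remapping (register names only; the argument-slot comments reflect the usual KVM hypercall ABI and are assumptions here):

#include <stdio.h>

struct gpr_map { const char *tdvmcall_reg; const char *kvm_hypercall_reg; };

/* R10..R14 at TDG.VP.VMCALL exit map onto the KVM hypercall registers. */
static const struct gpr_map vmcall_map[] = {
	{ "r10", "rax" },	/* hypercall number */
	{ "r11", "rbx" },	/* argument 0       */
	{ "r12", "rcx" },	/* argument 1       */
	{ "r13", "rdx" },	/* argument 2       */
	{ "r14", "rsi" },	/* argument 3       */
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(vmcall_map) / sizeof(vmcall_map[0]); i++)
		printf("%s -> %s\n", vmcall_map[i].tdvmcall_reg,
		       vmcall_map[i].kvm_hypercall_reg);
	return 0;
}
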
+ */ +#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) +static void __tdx_map_gpa(struct vcpu_tdx *tdx); + +static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (vcpu->run->hypercall.ret) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; + if (tdx->map_gpa_next >= tdx->map_gpa_end) + return 1; + + /* + * Stop processing the remaining part if there is a pending interrupt, + * which could be qualified to deliver. Skip checking pending RVI for + * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). + */ + if (kvm_vcpu_has_events(vcpu)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + __tdx_map_gpa(tdx); + return 0; +} + +static void __tdx_map_gpa(struct vcpu_tdx *tdx) +{ + u64 gpa = tdx->map_gpa_next; + u64 size = tdx->map_gpa_end - tdx->map_gpa_next; + + if (size > TDX_MAP_GPA_MAX_LEN) + size = TDX_MAP_GPA_MAX_LEN; + + tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; + tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + tdx->vcpu.run->hypercall.ret = 0; + tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; + tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? + KVM_MAP_GPA_RANGE_ENCRYPTED : + KVM_MAP_GPA_RANGE_DECRYPTED; + tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; + + tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; +} + +static int tdx_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + u64 ret; + + /* + * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires + * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE + * bit set. This is a base call so it should always be supported, but + * KVM has no way to ensure that userspace implements the GHCI correctly. + * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error + * to the guest. 
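
The MAP_GPA path above converts at most TDX_MAP_GPA_MAX_LEN (2 MiB) per round trip and bounces back to the guest with TDVMCALL_STATUS_RETRY if work becomes pending between chunks. A standalone sketch of that chunking loop, with a plain loop and a hypothetical interrupt_pending() check in place of the exit-to-userspace round trips:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define MAP_GPA_CHUNK	(2ULL * 1024 * 1024)	/* mirrors TDX_MAP_GPA_MAX_LEN */

/* Hypothetical "work pending" check standing in for kvm_vcpu_has_events(). */
static bool interrupt_pending(uint64_t processed_up_to)
{
	(void)processed_up_to;
	return false;
}

/* Returns the GPA the guest should retry from, or 'end' when finished. */
static uint64_t map_gpa_sketch(uint64_t start, uint64_t end)
{
	uint64_t next = start;

	while (next < end) {
		uint64_t len = end - next;

		if (len > MAP_GPA_CHUNK)
			len = MAP_GPA_CHUNK;

		printf("convert [0x%llx, 0x%llx)\n",
		       (unsigned long long)next,
		       (unsigned long long)(next + len));
		next += len;

		if (interrupt_pending(next))
			break;	/* guest retries from 'next' (TDVMCALL_STATUS_RETRY) */
	}
	return next;
}

int main(void)
{
	map_gpa_sketch(0x80000000ULL, 0x80000000ULL + 5ULL * 1024 * 1024);
	return 0;
}
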
+ */ + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + goto error; + } + + if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || + !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || + (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { + ret = TDVMCALL_STATUS_INVALID_OPERAND; + goto error; + } + + if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { + ret = TDVMCALL_STATUS_ALIGN_ERROR; + goto error; + } + + tdx->map_gpa_end = gpa + size; + tdx->map_gpa_next = gpa; + + __tdx_map_gpa(tdx); + return 0; + +error: + tdvmcall_set_return_code(vcpu, ret); + tdx->vp_enter_args.r11 = gpa; + return 1; +} + +static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 *regs = vcpu->run->system_event.data; + u64 *module_regs = &tdx->vp_enter_args.r8; + int index = VCPU_REGS_RAX; + + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; + vcpu->run->system_event.ndata = 16; + + /* Dump 16 general-purpose registers to userspace in ascending order. */ + regs[index++] = tdx->vp_enter_ret; + regs[index++] = tdx->vp_enter_args.rcx; + regs[index++] = tdx->vp_enter_args.rdx; + regs[index++] = tdx->vp_enter_args.rbx; + regs[index++] = 0; + regs[index++] = 0; + regs[index++] = tdx->vp_enter_args.rsi; + regs[index] = tdx->vp_enter_args.rdi; + for (index = 0; index < 8; index++) + regs[VCPU_REGS_R8 + index] = module_regs[index]; + + return 0; +} + +static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 eax, ebx, ecx, edx; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* EAX and ECX for cpuid is stored in R12 and R13. */ + eax = tdx->vp_enter_args.r12; + ecx = tdx->vp_enter_args.r13; + + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); + + tdx->vp_enter_args.r12 = eax; + tdx->vp_enter_args.r13 = ebx; + tdx->vp_enter_args.r14 = ecx; + tdx->vp_enter_args.r15 = edx; + + return 1; +} + +static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + +static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + int ret; + + ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + + WARN_ON_ONCE(!ret); + + tdvmcall_set_return_val(vcpu, val); + + return 1; +} + +static int tdx_emulate_io(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + unsigned int port; + u64 size, write; + int ret; + + ++vcpu->stat.io_exits; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + port = tdx->vp_enter_args.r14; + + if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (write) { + val = tdx->vp_enter_args.r15; + ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); + } else { + ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); + } + + if (!ret) + vcpu->arch.complete_userspace_io = write ? 
tdx_complete_pio_out : + tdx_complete_pio_in; + else if (!write) + tdvmcall_set_return_val(vcpu, val); + + return ret; +} + +static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) +{ + unsigned long val = 0; + gpa_t gpa; + int size; + + gpa = vcpu->mmio_fragments[0].gpa; + size = vcpu->mmio_fragments[0].len; + + memcpy(&val, vcpu->run->mmio.data, size); + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 1; +} + +static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, + unsigned long val) +{ + if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return 0; + } + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); + if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + return 0; +} + +static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) +{ + unsigned long val; + + if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 0; +} + +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + int size, write, r; + unsigned long val; + gpa_t gpa; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + gpa = tdx->vp_enter_args.r14; + val = write ? tdx->vp_enter_args.r15 : 0; + + if (size != 1 && size != 2 && size != 4 && size != 8) + goto error; + if (write != 0 && write != 1) + goto error; + + /* + * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to + * do MMIO emulation for private GPA. + */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) + goto error; + + gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + + if (write) + r = tdx_mmio_write(vcpu, gpa, size, val); + else + r = tdx_mmio_read(vcpu, gpa, size); + if (!r) + /* Kernel completed device emulation. */ + return 1; + + /* Request the device emulation to userspace device model. */ + vcpu->mmio_is_write = write; + if (!write) + vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = size; + vcpu->run->mmio.is_write = write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + if (write) { + memcpy(vcpu->run->mmio.data, &val, size); + } else { + vcpu->mmio_fragments[0].gpa = gpa; + vcpu->mmio_fragments[0].len = size; + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); + } + return 0; + +error: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; +} + +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. 
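
Several of the paths here (MMIO, MAP_GPA, GetQuote below) strip the shared bit from a guest GPA with gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(kvm)) before handing the address to generic code. A minimal illustration of that masking, assuming the shared bit is the top GPA bit, i.e. bit 47 under GPAW-48 and bit 51 under GPAW-52 (positions stated here as assumptions for the sketch):

#include <stdint.h>
#include <stdio.h>

/* Assumed shared-bit positions for each guest physical address width. */
#define SHARED_BIT_GPAW48	(1ULL << 47)
#define SHARED_BIT_GPAW52	(1ULL << 51)

static uint64_t strip_shared_bit(uint64_t gpa, uint64_t shared_bit)
{
	return gpa & ~shared_bit;
}

int main(void)
{
	uint64_t shared_gpa = SHARED_BIT_GPAW48 | 0xfee00000ULL;

	printf("shared GPA 0x%llx -> direct GPA 0x%llx\n",
	       (unsigned long long)shared_gpa,
	       (unsigned long long)strip_shared_bit(shared_gpa, SHARED_BIT_GPAW48));
	return 0;
}
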
+ */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + +static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + switch (tdx->vp_enter_args.r12) { + case 0: + tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; + tdx->vp_enter_args.r13 = 0; + tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } +} + +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + +static int handle_tdvmcall(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case TDVMCALL_MAP_GPA: + return tdx_map_gpa(vcpu); + case TDVMCALL_REPORT_FATAL_ERROR: + return tdx_report_fatal_error(vcpu); + case TDVMCALL_GET_TD_VM_CALL_INFO: + return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); + default: + break; + } + + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); + return 1; +} + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) +{ + u64 shared_bit = (pgd_level == 5) ? 
TDX_SHARED_BIT_PWL_5 : + TDX_SHARED_BIT_PWL_4; + + if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) + return; + + td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); +} + +static void tdx_unpin(struct kvm *kvm, struct page *page) +{ + put_page(page); +} + +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 entry, level_state; + u64 err; + + err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) { + tdx_unpin(kvm, page); + return -EBUSY; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + tdx_unpin(kvm, page); + return -EIO; + } + + return 0; +} + +/* + * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the + * callback tdx_gmem_post_populate() then maps pages into private memory. + * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the + * private EPT structures for the page to have been built before, which is + * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that + * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). + * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there + * are no half-initialized shared EPT pages. + */ +static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) + return -EINVAL; + + /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ + atomic64_inc(&kvm_tdx->nr_premapped); + return 0; +} + +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * Because guest_memfd doesn't support page migration with + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page + * migration. Until guest_memfd supports page migration, prevent page + * migration. + * TODO: Once guest_memfd introduces callback on page migration, + * implement it and remove get_page/put_page(). + */ + get_page(page); + + /* + * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching + * barrier in tdx_td_finalize(). + */ + smp_rmb(); + if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) + return tdx_mem_page_aug(kvm, gfn, level, page); + + return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); +} + +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * When zapping private page, write lock is held. So no race condition + * with other vcpu sept operation. + * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 
+ */ + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* + * The second retry is expected to succeed after kicking off all + * other vCPUs and prevent them from invoking TDH.VP.ENTER. + */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(page); + tdx_unpin(kvm, page); + return 0; +} + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + struct page *page = virt_to_page(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, + &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +/* + * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is + * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called + * successfully. + * + * Since tdh_mem_sept_add() must have been invoked successfully before a + * non-leaf entry present in the mirrored page table, the SEPT ZAP related + * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead + * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the + * SEPT. + * + * Further check if the returned entry from SEPT walking is with RWX permissions + * to filter out anything unexpected. + * + * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from + * level_state returned from a SEAMCALL error is the same as that passed into + * the SEAMCALL. + */ +static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, + u64 entry, int level) +{ + if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) + return false; + + if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) + return false; + + if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) + return false; + + return true; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. 
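
The S-EPT teardown SEAMCALLs above (TDH.MEM.PAGE.REMOVE, TDH.MEM.RANGE.BLOCK, and TDH.MEM.TRACK further down) share one contention strategy: try once, and on TDX_OPERAND_BUSY block further vCPU entries, retry once, then unblock. A schematic sketch of that pattern with hypothetical helpers and result codes:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical SEAMCALL result codes for the sketch. */
enum seamcall_status { SEAM_OK, SEAM_BUSY, SEAM_ERROR };

static bool vcpus_blocked;

static void no_vcpus_enter_start(void) { vcpus_blocked = true;  }
static void no_vcpus_enter_stop(void)  { vcpus_blocked = false; }

/* First call reports contention; the retry (with vCPUs blocked) succeeds. */
static enum seamcall_status fake_seamcall(void)
{
	return vcpus_blocked ? SEAM_OK : SEAM_BUSY;
}

static int retry_after_kick(void)
{
	enum seamcall_status err = fake_seamcall();

	if (err == SEAM_BUSY) {
		no_vcpus_enter_start();	/* kick vCPUs out, stop re-entry */
		err = fake_seamcall();	/* second attempt is expected to succeed */
		no_vcpus_enter_stop();
	}
	return err == SEAM_OK ? 0 : -1;
}

int main(void)
{
	printf("retry_after_kick() = %d\n", retry_after_kick());
	return 0;
}
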
*/ + WARN_ON_ONCE(level != PG_LEVEL_4K); + + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && + !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { + atomic64_dec(&kvm_tdx->nr_premapped); + tdx_unpin(kvm, page); + return 0; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + return -EIO; + } + return 1; +} + +/* + * Ensure shared and private EPTs to be flushed on all vCPUs. + * tdh_mem_track() is the only caller that increases TD epoch. An increase in + * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are + * running in guest mode with the value "N - 1". + * + * A successful execution of tdh_mem_track() ensures that vCPUs can only run in + * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch + * being increased to "N + 1". + * + * Kicking off all vCPUs after that further results in no vCPUs can run in guest + * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. + * to increase TD epoch to "N + 2"). + * + * TDX module will flush EPT on the next TD enter and make vCPUs to run in + * guest mode with TD epoch value "N + 1". + * + * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by + * waiting empty IPI handler ack_kick(). + * + * No action is required to the vCPUs being kicked off since the kicking off + * occurs certainly after TD epoch increment and before the next + * tdh_mem_track(). + */ +static void tdx_track(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + + /* If TD isn't finalized, it's before any vcpu running. */ + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return; + + lockdep_assert_held_write(&kvm->mmu_lock); + + err = tdh_mem_track(&kvm_tdx->td); + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_track(&kvm_tdx->td); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) + pr_tdx_error(TDH_MEM_TRACK, err); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* + * free_external_spt() is only called after hkid is freed when TD is + * tearing down. + * KVM doesn't (yet) zap page table pages in mirror page table while + * TD is active, though guest pages mapped in mirror page table could be + * zapped during TD is active, e.g. for shared <-> private conversion + * and slot move/deletion. + */ + if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * The HKID assigned to this TD was already freed and cache was + * already flushed. We don't have to flush again. + */ + return tdx_reclaim_page(virt_to_page(private_spt)); +} + +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + int ret; + + /* + * HKID is released after all private pages have been removed, and set + * before any might be populated. 
Warn if zapping is attempted when + * there can't be anything populated in the private EPT. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); + if (ret <= 0) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, page); +} + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* TDX supports only posted interrupt. No lapic emulation. */ + __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); +} + +static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) +{ + u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; + u64 eq = vmx_get_exit_qual(vcpu); + + if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) + return false; + + return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); +} + +static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual; + gpa_t gpa = to_tdx(vcpu)->exit_gpa; + bool local_retry = false; + int ret; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + if (tdx_is_sept_violation_unexpected_pending(vcpu)) { + pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", + gpa, vcpu->vcpu_id); + kvm_vm_dead(vcpu->kvm); + return -EIO; + } + /* + * Always treat SEPT violations as write faults. Ignore the + * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. + * TD private pages are always RWX in the SEPT tables, + * i.e. they're always mapped writable. Just as importantly, + * treating SEPT violations as write faults is necessary to + * avoid COW allocations, which will cause TDAUGPAGE failures + * due to aliasing a single HPA to multiple GPAs. + */ + exit_qual = EPT_VIOLATION_ACC_WRITE; + + /* Only private GPA triggers zero-step mitigation */ + local_retry = true; + } else { + exit_qual = vmx_get_exit_qual(vcpu); + /* + * EPT violation due to instruction fetch should never be + * triggered from shared memory in TDX guest. If such EPT + * violation occurs, treat it as broken hardware. + */ + if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) + return -EIO; + } + + trace_kvm_page_fault(vcpu, gpa, exit_qual); + + /* + * To minimize TDH.VP.ENTER invocations, retry locally for private GPA + * mapping in TDX. + * + * KVM may return RET_PF_RETRY for private GPA due to + * - contentions when atomically updating SPTEs of the mirror page table + * - in-progress GFN invalidation or memslot removal. + * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, + * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) + * or certain TDCALLs. + * + * If TDH.VP.ENTER is invoked more times than the threshold set by the + * TDX module before KVM resolves the private GPA mapping, the TDX + * module will activate zero-step mitigation during TDH.VP.ENTER. This + * process acquires an SEPT tree lock in the TDX module, leading to + * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD + * operations on other vCPUs. + * + * Breaking out of local retries for kvm_vcpu_has_events() is for + * interrupt injection. 
kvm_vcpu_has_events() should not see pending + * events for TDX. Since KVM can't determine if IRQs (or NMIs) are + * blocked by TDs, false positives are inevitable i.e., KVM may re-enter + * the guest even if the IRQ/NMI can't be delivered. + * + * Note: even without breaking out of local retries, zero-step + * mitigation may still occur due to + * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, + * - a single RIP causing EPT violations for more GFNs than the + * threshold count. + * This is safe, as triggering zero-step mitigation only introduces + * contentions to page installation SEAMCALLs on other vCPUs, which will + * handle retries locally in their EPT violation handlers. + */ + while (1) { + ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); + + if (ret != RET_PF_RETRY || !local_retry) + break; + + if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) + break; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { + ret = -EIO; + break; + } + + cond_resched(); + } + return ret; +} + +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) + tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); + + return 1; +} + + +int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 vp_enter_ret = tdx->vp_enter_ret; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); + + if (fastpath != EXIT_FASTPATH_NONE) + return 1; + + if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { + KVM_BUG_ON(1, vcpu->kvm); + return -EIO; + } + + /* + * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and + * TDX_SEAMCALL_VMFAILINVALID. + */ + if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { + KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); + goto unhandled_exit; + } + + if (unlikely(tdx_failed_vmentry(vcpu))) { + /* + * If the guest state is protected, that means off-TD debug is + * not enabled, TDX_NON_RECOVERABLE must be set. 
+ */ + WARN_ON_ONCE(vcpu->arch.guest_state_protected && + !(vp_enter_ret & TDX_NON_RECOVERABLE)); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; + return 0; + } + + if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && + exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { + kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); + goto unhandled_exit; + } + + WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && + (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); + + switch (exit_reason.basic) { + case EXIT_REASON_TRIPLE_FAULT: + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + return 0; + case EXIT_REASON_EXCEPTION_NMI: + return tdx_handle_exception_nmi(vcpu); + case EXIT_REASON_EXTERNAL_INTERRUPT: + ++vcpu->stat.irq_exits; + return 1; + case EXIT_REASON_CPUID: + return tdx_emulate_cpuid(vcpu); + case EXIT_REASON_HLT: + return kvm_emulate_halt_noskip(vcpu); + case EXIT_REASON_TDCALL: + return handle_tdvmcall(vcpu); + case EXIT_REASON_VMCALL: + return tdx_emulate_vmcall(vcpu); + case EXIT_REASON_IO_INSTRUCTION: + return tdx_emulate_io(vcpu); + case EXIT_REASON_MSR_READ: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + return kvm_emulate_rdmsr(vcpu); + case EXIT_REASON_MSR_WRITE: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); + kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); + return kvm_emulate_wrmsr(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + return tdx_emulate_mmio(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return tdx_handle_ept_violation(vcpu); + case EXIT_REASON_OTHER_SMI: + /* + * Unlike VMX, SMI in SEAM non-root mode (i.e. when + * TD guest vCPU is running) will cause VM exit to TDX module, + * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered + * and handled by kernel handler right away. + * + * The Other SMI exit can also be caused by the SEAM non-root + * machine check delivered via Machine Check System Management + * Interrupt (MSMI), but it has already been handled by the + * kernel machine check handler, i.e., the memory page has been + * marked as poisoned and it won't be freed to the free list + * when the TDX guest is terminated (the TDX module marks the + * guest as dead and prevent it from further running when + * machine check happens in SEAM non-root). + * + * - A MSMI will not reach here, it's handled as non_recoverable + * case above. + * - If it's not an MSMI, no need to do anything here. 
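
The MSR exit cases above repack the TDVMCALL arguments into the layout the generic RDMSR/WRMSR emulators expect: the MSR index goes into RCX, and for writes the 64-bit payload carried in R13 is split into the architectural EDX:EAX pair. A two-line sketch of the split and the matching recombination:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r13 = 0x1122334455667788ULL;	/* WRMSR payload from the guest */
	uint32_t eax = (uint32_t)r13;		/* low 32 bits  */
	uint32_t edx = (uint32_t)(r13 >> 32);	/* high 32 bits */
	uint64_t data = ((uint64_t)edx << 32) | eax;

	printf("eax=0x%x edx=0x%x recombined=0x%llx\n",
	       eax, edx, (unsigned long long)data);
	return 0;
}
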
+ */ + return 1; + default: + break; + } + +unhandled_exit: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vp_enter_ret; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + return 0; +} + +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + *reason = tdx->vt.exit_reason.full; + if (*reason != -1u) { + *info1 = vmx_get_exit_qual(vcpu); + *info2 = tdx->ext_exit_qualification; + *intr_info = vmx_get_intr_info(vcpu); + } else { + *info1 = 0; + *info2 = 0; + *intr_info = 0; + } + + *error_code = 0; +} + +bool tdx_has_emulated_msr(u32 index) +{ + switch (index) { + case MSR_IA32_UCODE_REV: + case MSR_IA32_ARCH_CAPABILITIES: + case MSR_IA32_POWER_CTL: + case MSR_IA32_CR_PAT: + case MSR_MTRRcap: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_TSC_DEADLINE: + case MSR_IA32_MISC_ENABLE: + case MSR_PLATFORM_INFO: + case MSR_MISC_FEATURES_ENABLES: + case MSR_IA32_APICBASE: + case MSR_EFER: + case MSR_IA32_FEAT_CTL: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_EXT_CTL: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ + case MSR_KVM_POLL_CONTROL: + return true; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + /* + * x2APIC registers that are virtualized by the CPU can't be + * emulated, KVM doesn't have access to the virtual APIC page. + */ + switch (index) { + case X2APIC_MSR(APIC_TASKPRI): + case X2APIC_MSR(APIC_PROCPRI): + case X2APIC_MSR(APIC_EOI): + case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): + case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): + case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): + return false; + default: + return true; + } + default: + return false; + } +} + +static bool tdx_is_read_only_msr(u32 index) +{ + return index == MSR_IA32_APICBASE || index == MSR_EFER || + index == MSR_IA32_FEAT_CTL; +} + +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_FEAT_CTL: + /* + * MCE and MCA are advertised via cpuid. Guest kernel could + * check if LMCE is enabled or not. 
+ */ + msr->data = FEAT_CTL_LOCKED; + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + msr->data |= FEAT_CTL_LMCE_ENABLED; + return 0; + case MSR_IA32_MCG_EXT_CTL: + if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) + return 1; + msr->data = vcpu->arch.mcg_ext_ctl; + return 0; + default: + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_get_msr_common(vcpu, msr); + } +} + +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_MCG_EXT_CTL: + if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || + (msr->data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = msr->data; + return 0; + default: + if (tdx_is_read_only_msr(msr->index)) + return 1; + + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_set_msr_common(vcpu, msr); + } +} + +static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_tdx_capabilities __user *user_caps; + struct kvm_tdx_capabilities *caps = NULL; + int ret = 0; + + /* flags is reserved for future use */ + if (cmd->flags) + return -EINVAL; + + caps = kmalloc(sizeof(*caps) + + sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, + GFP_KERNEL); + if (!caps) + return -ENOMEM; + + user_caps = u64_to_user_ptr(cmd->data); + if (copy_from_user(caps, user_caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (caps->cpuid.nent < td_conf->num_cpuid_config) { + ret = -E2BIG; + goto out; + } + + ret = init_kvm_tdx_caps(td_conf, caps); + if (ret) + goto out; + + if (copy_to_user(user_caps, caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, + caps->cpuid.nent * + sizeof(caps->cpuid.entries[0]))) + ret = -EFAULT; + +out: + /* kfree() accepts NULL. */ + kfree(caps); + return ret; +} + +/* + * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is + * similar to TDX's GPAW. Use this field as the interface for userspace to + * configure the GPAW and EPT level for TDs. + * + * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level + * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always + * supported. Value 52 is only supported when the platform supports 5 level + * EPT. + */ +static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct kvm_cpuid_entry2 *entry; + int guest_pa; + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); + if (!entry) + return -EINVAL; + + guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); + + if (guest_pa != 48 && guest_pa != 52) + return -EINVAL; + + if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) + return -EINVAL; + + td_params->eptp_controls = VMX_EPTP_MT_WB; + if (guest_pa == 52) { + td_params->eptp_controls |= VMX_EPTP_PWL_5; + td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; + } else { + td_params->eptp_controls |= VMX_EPTP_PWL_4; + } + + return 0; +} + +static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + const struct kvm_cpuid_entry2 *entry; + struct tdx_cpuid_value *value; + int i, copy_cnt = 0; + + /* + * td_params.cpuid_values: The number and the order of cpuid_value must + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} + * It's assumed that td_params was zeroed. 
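
setup_tdparams_eptp_controls() above uses CPUID.0x80000008:EAX[23:16] as the userspace knob for the guest physical address width: 48 selects GPAW-48 with 4-level secure EPT, 52 selects GPAW-52 with 5-level EPT, and everything else is rejected. The helper below is a sketch of extracting that field and choosing the page-walk level; it mirrors the function's checks rather than any real KVM helper.

#include <stdint.h>
#include <stdio.h>

/* CPUID.0x80000008:EAX[23:16] carries the guest physical address bits. */
static unsigned int guest_phys_addr_bits(uint32_t eax)
{
	return (eax >> 16) & 0xff;
}

static int pick_ept_level(uint32_t eax, int *level)
{
	unsigned int gpaw = guest_phys_addr_bits(eax);

	if (gpaw == 48) {
		*level = 4;		/* VMX_EPTP_PWL_4 */
		return 0;
	}
	if (gpaw == 52) {
		*level = 5;		/* VMX_EPTP_PWL_5, plus max-GPAW config flag */
		return 0;
	}
	return -1;			/* only 48 and 52 are accepted */
}

int main(void)
{
	uint32_t eax = 48u << 16;	/* other EAX fields (e.g. bits 7:0) omitted */
	int level;

	if (!pick_ept_level(eax, &level))
		printf("GPAW-%u -> %d-level EPT\n", guest_phys_addr_bits(eax), level);
	return 0;
}
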
+ */ + for (i = 0; i < td_conf->num_cpuid_config; i++) { + struct kvm_cpuid_entry2 tmp; + + td_init_cpuid_entry2(&tmp, i); + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, + tmp.function, tmp.index); + if (!entry) + continue; + + if (tdx_unsupported_cpuid(entry)) + return -EINVAL; + + copy_cnt++; + + value = &td_params->cpuid_values[i]; + value->eax = entry->eax; + value->ebx = entry->ebx; + value->ecx = entry->ecx; + value->edx = entry->edx; + + /* + * TDX module does not accept nonzero bits 16..23 for the + * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). + */ + if (tmp.function == 0x80000008) + value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); + } + + /* + * Rely on the TDX module to reject invalid configuration, but it can't + * check of leafs that don't have a proper slot in td_params->cpuid_values + * to stick then. So fail if there were entries that didn't get copied to + * td_params. + */ + if (copy_cnt != cpuid->nent) + return -EINVAL; + + return 0; +} + +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, + struct kvm_tdx_init_vm *init_vm) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; + int ret; + + if (kvm->created_vcpus) + return -EBUSY; + + if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) + return -EINVAL; + + if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) + return -EINVAL; + + td_params->max_vcpus = kvm->max_vcpus; + td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; + td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; + + td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; + td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); + + ret = setup_tdparams_eptp_controls(cpuid, td_params); + if (ret) + return ret; + + ret = setup_tdparams_cpuids(cpuid, td_params); + if (ret) + return ret; + +#define MEMCPY_SAME_SIZE(dst, src) \ + do { \ + BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ + memcpy((dst), (src), sizeof(dst)); \ + } while (0) + + MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); + MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); + MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); + + return 0; +} + +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, + u64 *seamcall_err) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages; + struct page **tdcs_pages = NULL; + struct page *tdr_page; + int ret, i; + u64 err, rcx; + + *seamcall_err = 0; + ret = tdx_guest_keyid_alloc(); + if (ret < 0) + return ret; + kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; + + ret = -ENOMEM; + + atomic_inc(&nr_configured_hkid); + + tdr_page = alloc_page(GFP_KERNEL); + if (!tdr_page) + goto free_hkid; + + kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; + /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. 
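
The control-structure sizing around here is plain arithmetic on the sizes the TDX module reports: the TD-wide TDCS needs tdcs_base_size / PAGE_SIZE pages, and because the first TDVPS page is the TDVPR itself, each vCPU needs tdvps_base_size / PAGE_SIZE - 1 TDCX pages. A worked example with made-up module-reported sizes:

#include <stdio.h>

#define PAGE_SIZE_SKETCH	4096

int main(void)
{
	/* Example sizes only; the real values come from the TDX module. */
	unsigned long tdcs_base_size  = 6 * PAGE_SIZE_SKETCH;
	unsigned long tdvps_base_size = 4 * PAGE_SIZE_SKETCH;

	unsigned long tdcs_nr_pages = tdcs_base_size / PAGE_SIZE_SKETCH;
	/* TDVPS = TDVPR (one page) + TDCX pages, hence the -1. */
	unsigned long tdcx_nr_pages = tdvps_base_size / PAGE_SIZE_SKETCH - 1;

	printf("TDCS pages: %lu, TDCX pages per vCPU: %lu\n",
	       tdcs_nr_pages, tdcx_nr_pages);
	return 0;
}
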
*/ + kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; + tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), + GFP_KERNEL | __GFP_ZERO); + if (!tdcs_pages) + goto free_tdr; + + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + tdcs_pages[i] = alloc_page(GFP_KERNEL); + if (!tdcs_pages[i]) + goto free_tdcs; + } + + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) + goto free_tdcs; + + cpus_read_lock(); + + /* + * Need at least one CPU of the package to be online in order to + * program all packages for host key id. Check it. + */ + for_each_present_cpu(i) + cpumask_set_cpu(topology_physical_package_id(i), packages); + for_each_online_cpu(i) + cpumask_clear_cpu(topology_physical_package_id(i), packages); + if (!cpumask_empty(packages)) { + ret = -EIO; + /* + * Because it's hard for human operator to figure out the + * reason, warn it. + */ +#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n" + pr_warn_ratelimited(MSG_ALLPKG); + goto free_packages; + } + + /* + * TDH.MNG.CREATE tries to grab the global TDX module and fails + * with TDX_OPERAND_BUSY when it fails to grab. Take the global + * lock to prevent it from failure. + */ + mutex_lock(&tdx_lock); + kvm_tdx->td.tdr_page = tdr_page; + err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); + mutex_unlock(&tdx_lock); + + if (err == TDX_RND_NO_ENTROPY) { + ret = -EAGAIN; + goto free_packages; + } + + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_CREATE, err); + ret = -EIO; + goto free_packages; + } + + for_each_online_cpu(i) { + int pkg = topology_physical_package_id(i); + + if (cpumask_test_and_set_cpu(pkg, packages)) + continue; + + /* + * Program the memory controller in the package with an + * encryption key associated to a TDX private host key id + * assigned to this TDR. Concurrent operations on same memory + * controller results in TDX_OPERAND_BUSY. No locking needed + * beyond the cpus_read_lock() above as it serializes against + * hotplug and the first online CPU of the package is always + * used. We never have two CPUs in the same socket trying to + * program the key. + */ + ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, + kvm_tdx, true); + if (ret) + break; + } + cpus_read_unlock(); + free_cpumask_var(packages); + if (ret) { + i = 0; + goto teardown; + } + + kvm_tdx->td.tdcs_pages = tdcs_pages; + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); + if (err == TDX_RND_NO_ENTROPY) { + /* Here it's hard to allow userspace to retry. */ + ret = -EAGAIN; + goto teardown; + } + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_ADDCX, err); + ret = -EIO; + goto teardown; + } + } + + err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); + if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { + /* + * Because a user gives operands, don't warn. + * Return a hint to the user because it's sometimes hard for the + * user to figure out which operand is invalid. SEAMCALL status + * code includes which operand caused invalid operand error. + */ + *seamcall_err = err; + ret = -EINVAL; + goto teardown; + } else if (WARN_ON_ONCE(err)) { + pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + ret = -EIO; + goto teardown; + } + + return 0; + + /* + * The sequence for freeing resources from a partially initialized TD + * varies based on where in the initialization flow failure occurred. + * Simply use the full teardown and destroy, which naturally play nice + * with partial initialization. 
+ */ +teardown: + /* Only free pages not yet added, so start at 'i' */ + for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) { + __free_page(tdcs_pages[i]); + tdcs_pages[i] = NULL; + } + } + if (!kvm_tdx->td.tdcs_pages) + kfree(tdcs_pages); + + tdx_mmu_release_hkid(kvm); + tdx_reclaim_td_control_pages(kvm); + + return ret; + +free_packages: + cpus_read_unlock(); + free_cpumask_var(packages); + +free_tdcs: + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) + __free_page(tdcs_pages[i]); + } + kfree(tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + +free_tdr: + if (tdr_page) + __free_page(tdr_page); + kvm_tdx->td.tdr_page = 0; + +free_hkid: + tdx_hkid_free(kvm_tdx); + + return ret; +} + +static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, + u64 *data) +{ + u64 err; + + err = tdh_mng_rd(&tdx->td, field_id, data); + + return err; +} + +#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) +#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) + +static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, + bool sub_leaf_set, int *entry_index, + struct kvm_cpuid_entry2 *out) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; + u64 ebx_eax, edx_ecx; + u64 err = 0; + + if (sub_leaf > 0b1111111) + return -EINVAL; + + if (*entry_index >= KVM_MAX_CPUID_ENTRIES) + return -EINVAL; + + if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || + sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) + return -EINVAL; + + /* + * bit 23:17, REVSERVED: reserved, must be 0; + * bit 16, LEAF_31: leaf number bit 31; + * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are + * implicitly 0; + * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; + * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, + * the SUBLEAF_6_0 is all-1. + * sub-leaf bits 31:7 are implicitly 0; + * bit 0, ELEMENT_I: Element index within field; + */ + field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; + field_id |= (leaf & 0x7f) << 9; + if (sub_leaf_set) + field_id |= (sub_leaf & 0x7f) << 1; + else + field_id |= 0x1fe; + + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); + if (err) //TODO check for specific errors + goto err_out; + + out->eax = (u32) ebx_eax; + out->ebx = (u32) (ebx_eax >> 32); + + field_id++; + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); + /* + * It's weird that reading edx_ecx fails while reading ebx_eax + * succeeded. + */ + if (WARN_ON_ONCE(err)) + goto err_out; + + out->ecx = (u32) edx_ecx; + out->edx = (u32) (edx_ecx >> 32); + + out->function = leaf; + out->index = sub_leaf; + out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; + + /* + * Work around missing support on old TDX modules, fetch + * guest maxpa from gfn_direct_bits. 
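+	 * For example, with the shared bit at GPA[47] (the 4-level EPT
+	 * configuration), gfn_to_gpa(kvm_gfn_direct_bits()) has only bit 47
+	 * set, so __ffs() returns 47 and the derived guest MAXPA is 48; the
+	 * GPA[51] configuration yields 52 accordingly.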
+ */ + if (leaf == 0x80000008) { + gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + unsigned int g_maxpa = __ffs(gpa_bits) + 1; + + out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); + } + + (*entry_index)++; + + return 0; + +err_out: + out->eax = 0; + out->ebx = 0; + out->ecx = 0; + out->edx = 0; + + return -EIO; +} + +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_vm *init_vm; + struct td_params *td_params = NULL; + int ret; + + BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); + BUILD_BUG_ON(sizeof(struct td_params) != 1024); + + if (kvm_tdx->state != TD_STATE_UNINITIALIZED) + return -EINVAL; + + if (cmd->flags) + return -EINVAL; + + init_vm = kmalloc(sizeof(*init_vm) + + sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!init_vm) + return -ENOMEM; + + if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { + ret = -EFAULT; + goto out; + } + + if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { + ret = -E2BIG; + goto out; + } + + if (copy_from_user(init_vm->cpuid.entries, + u64_to_user_ptr(cmd->data) + sizeof(*init_vm), + flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { + ret = -EFAULT; + goto out; + } + + if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { + ret = -EINVAL; + goto out; + } + + if (init_vm->cpuid.padding) { + ret = -EINVAL; + goto out; + } + + td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); + if (!td_params) { + ret = -ENOMEM; + goto out; + } + + ret = setup_tdparams(kvm, td_params, init_vm); + if (ret) + goto out; + + ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); + if (ret) + goto out; + + kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); + kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); + kvm_tdx->attributes = td_params->attributes; + kvm_tdx->xfam = td_params->xfam; + + if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; + else + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; + + kvm_tdx->state = TD_STATE_INITIALIZED; +out: + /* kfree() accepts NULL. */ + kfree(init_vm); + kfree(td_params); + + return ret; +} + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + /* + * flush_tlb_current() is invoked when the first time for the vcpu to + * run or when root of shared EPT is invalidated. + * KVM only needs to flush shared EPT because the TDX module handles TLB + * invalidation for private EPT in tdh_vp_enter(); + * + * A single context invalidation for shared EPT can be performed here. + * However, this single context invalidation requires the private EPTP + * rather than the shared EPTP to flush shared EPT, as shared EPT uses + * private EPTP as its ASID for TLB invalidation. + * + * To avoid reading back private EPTP, perform a global invalidation for + * shared EPT instead to keep this function simple. + */ + ept_sync_global(); +} + +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * TDX has called tdx_track() in tdx_sept_remove_private_spte() to + * ensure that private EPT will be flushed on the next TD enter. No need + * to call tdx_track() here again even when this callback is a result of + * zapping private EPT. 
+ * + * Due to the lack of the context to determine which EPT has been + * affected by zapping, invoke invept() directly here for both shared + * EPT and private EPT for simplicity, though it's not necessary for + * private EPT. + */ + ept_sync_global(); +} + +static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + guard(mutex)(&kvm->slots_lock); + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + /* + * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue + * TDH.MEM.PAGE.ADD(). + */ + if (atomic64_read(&kvm_tdx->nr_premapped)) + return -EINVAL; + + cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); + if (tdx_operand_busy(cmd->hw_error)) + return -EBUSY; + if (KVM_BUG_ON(cmd->hw_error, kvm)) { + pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + return -EIO; + } + + kvm_tdx->state = TD_STATE_RUNNABLE; + /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ + smp_wmb(); + kvm->arch.pre_fault_allowed = true; + return 0; +} + +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +{ + struct kvm_tdx_cmd tdx_cmd; + int r; + + if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) + return -EFAULT; + + /* + * Userspace should never set hw_error. It is used to fill + * hardware-defined error by the kernel. + */ + if (tdx_cmd.hw_error) + return -EINVAL; + + mutex_lock(&kvm->lock); + + switch (tdx_cmd.id) { + case KVM_TDX_CAPABILITIES: + r = tdx_get_capabilities(&tdx_cmd); + break; + case KVM_TDX_INIT_VM: + r = tdx_td_init(kvm, &tdx_cmd); + break; + case KVM_TDX_FINALIZE_VM: + r = tdx_td_finalize(kvm, &tdx_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct page *page; + int ret, i; + u64 err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + tdx->vp.tdvpr_page = page; + + tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), + GFP_KERNEL); + if (!tdx->vp.tdcx_pages) { + ret = -ENOMEM; + goto free_tdvpr; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_tdcx; + } + tdx->vp.tdcx_pages[i] = page; + } + + err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); + if (KVM_BUG_ON(err, vcpu->kvm)) { + ret = -EIO; + pr_tdx_error(TDH_VP_CREATE, err); + goto free_tdcx; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_ADDCX, err); + /* + * Pages already added are reclaimed by the vcpu_free + * method, but the rest are freed here. 
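+			 * That is, pages [0, i) remain attached to the vCPU for
+			 * the vcpu_free callback, while pages [i, tdcx_nr_pages)
+			 * are freed in the loop below.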
+			 */
+			for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+				__free_page(tdx->vp.tdcx_pages[i]);
+				tdx->vp.tdcx_pages[i] = NULL;
+			}
+			return -EIO;
+		}
+	}
+
+	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+	if (KVM_BUG_ON(err, vcpu->kvm)) {
+		pr_tdx_error(TDH_VP_INIT, err);
+		return -EIO;
+	}
+
+	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+	return 0;
+
+free_tdcx:
+	for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+		if (tdx->vp.tdcx_pages[i])
+			__free_page(tdx->vp.tdcx_pages[i]);
+		tdx->vp.tdcx_pages[i] = NULL;
+	}
+	kfree(tdx->vp.tdcx_pages);
+	tdx->vp.tdcx_pages = NULL;
+
+free_tdvpr:
+	if (tdx->vp.tdvpr_page)
+		__free_page(tdx->vp.tdvpr_page);
+	tdx->vp.tdvpr_page = 0;
+
+	return ret;
+}
+
+/* Sometimes reads multiple sub-leaves. Returns how many entries were written. */
+static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
+				   struct kvm_cpuid_entry2 *output_e)
+{
+	int sub_leaf = 0;
+	int ret;
+
+	/* First try without a sub-leaf. */
+	ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
+
+	/* On success, or for an invalid leaf, just give up. */
+	if (ret != -EIO)
+		return ret;
+
+	/*
+	 * If the read without a sub-leaf failed, try reading sub-leaves until
+	 * one fails. The TDX module only supports 6 bits of sub-leaf index.
+	 */
+	while (1) {
+		/* Keep reading sub-leaves until there is a failure. */
+		if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
+			return !sub_leaf;
+
+		sub_leaf++;
+		output_e++;
+	}
+
+	return 0;
+}
+
+static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+	struct kvm_cpuid2 __user *output, *td_cpuid;
+	int r = 0, i = 0, leaf;
+	u32 level;
+
+	output = u64_to_user_ptr(cmd->data);
+	td_cpuid = kzalloc(sizeof(*td_cpuid) +
+			sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
+			GFP_KERNEL);
+	if (!td_cpuid)
+		return -ENOMEM;
+
+	if (copy_from_user(td_cpuid, output, sizeof(*output))) {
+		r = -EFAULT;
+		goto out;
+	}
+
+	/* Read max CPUID for the normal range. */
+	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
+		r = -EIO;
+		goto out;
+	}
+	level = td_cpuid->entries[0].eax;
+
+	for (leaf = 1; leaf <= level; leaf++)
+		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+	/* Read max CPUID for the extended range. */
+	if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
+		r = -EIO;
+		goto out;
+	}
+	level = td_cpuid->entries[i - 1].eax;
+
+	for (leaf = 0x80000001; leaf <= level; leaf++)
+		tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+	if (td_cpuid->nent < i)
+		r = -E2BIG;
+	td_cpuid->nent = i;
+
+	if (copy_to_user(output, td_cpuid, sizeof(*output))) {
+		r = -EFAULT;
+		goto out;
+	}
+
+	if (r == -E2BIG)
+		goto out;
+
+	if (copy_to_user(output->entries, td_cpuid->entries,
+			 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+		r = -EFAULT;
+
+out:
+	kfree(td_cpuid);
+
+	return r;
+}
+
+static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+	u64 apic_base;
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	int ret;
+
+	if (cmd->flags)
+		return -EINVAL;
+
+	if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
+		return -EINVAL;
+
+	/*
+	 * TDX requires X2APIC; userspace is responsible for configuring guest
+	 * CPUID accordingly.
+	 */
+	apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+		    (kvm_vcpu_is_reset_bsp(vcpu) ?
MSR_IA32_APICBASE_BSP : 0); + if (kvm_apic_set_base(vcpu, apic_base, true)) + return -EINVAL; + + ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); + if (ret) + return ret; + + td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); + td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); + td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); + + tdx->state = VCPU_TD_STATE_INITIALIZED; + + return 0; +} + +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + /* + * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all + * INIT events. + * + * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as + * userspace needs to define the vCPU model before KVM can initialize + * vCPU state, e.g. to enable x2APIC. + */ + WARN_ON_ONCE(init_event); +} + +struct tdx_gmem_post_populate_arg { + struct kvm_vcpu *vcpu; + __u32 flags; +}; + +static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *_arg) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct tdx_gmem_post_populate_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + gpa_t gpa = gfn_to_gpa(gfn); + u8 level = PG_LEVEL_4K; + struct page *src_page; + int ret, i; + u64 err, entry, level_state; + + /* + * Get the source page if it has been faulted in. Return failure if the + * source page has been swapped out or unmapped in primary memory. + */ + ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); + if (ret < 0) + return ret; + if (ret != 1) + return -ENOMEM; + + ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); + if (ret < 0) + goto out; + + /* + * The private mem cannot be zapped after kvm_tdp_map_page() + * because all paths are covered by slots_lock and the + * filemap invalidate lock. Check that they are indeed enough. + */ + if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + scoped_guard(read_lock, &kvm->mmu_lock) { + if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { + ret = -EIO; + goto out; + } + } + } + + ret = 0; + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + src_page, &entry, &level_state); + if (err) { + ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; + goto out; + } + + if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + atomic64_dec(&kvm_tdx->nr_premapped); + + if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, + &level_state); + if (err) { + ret = -EIO; + break; + } + } + } + +out: + put_page(src_page); + return ret; +} + +static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_mem_region region; + struct tdx_gmem_post_populate_arg arg; + long gmem_ret; + int ret; + + if (tdx->state != VCPU_TD_STATE_INITIALIZED) + return -EINVAL; + + guard(mutex)(&kvm->slots_lock); + + /* Once TD is finalized, the initial guest memory is fixed. 
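+	 * (Hence KVM_TDX_INIT_MEM_REGION is only accepted after
+	 * KVM_TDX_INIT_VCPU and before KVM_TDX_FINALIZE_VM.)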
	 */
+	if (kvm_tdx->state == TD_STATE_RUNNABLE)
+		return -EINVAL;
+
+	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+		return -EINVAL;
+
+	if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
+		return -EFAULT;
+
+	if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
+	    !region.nr_pages ||
+	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+	    !vt_is_tdx_private_gpa(kvm, region.gpa) ||
+	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
+		return -EINVAL;
+
+	kvm_mmu_reload(vcpu);
+	ret = 0;
+	while (region.nr_pages) {
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		arg = (struct tdx_gmem_post_populate_arg) {
+			.vcpu = vcpu,
+			.flags = cmd->flags,
+		};
+		gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
+					     u64_to_user_ptr(region.source_addr),
+					     1, tdx_gmem_post_populate, &arg);
+		if (gmem_ret < 0) {
+			ret = gmem_ret;
+			break;
+		}
+
+		if (gmem_ret != 1) {
+			ret = -EIO;
+			break;
+		}
+
+		region.source_addr += PAGE_SIZE;
+		region.gpa += PAGE_SIZE;
+		region.nr_pages--;
+
+		cond_resched();
+	}
+
+	if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
+		ret = -EFAULT;
+	return ret;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+	struct kvm_tdx_cmd cmd;
+	int ret;
+
+	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+		return -EINVAL;
+
+	if (copy_from_user(&cmd, argp, sizeof(cmd)))
+		return -EFAULT;
+
+	if (cmd.hw_error)
+		return -EINVAL;
+
+	switch (cmd.id) {
+	case KVM_TDX_INIT_VCPU:
+		ret = tdx_vcpu_init(vcpu, &cmd);
+		break;
+	case KVM_TDX_INIT_MEM_REGION:
+		ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+		break;
+	case KVM_TDX_GET_CPUID:
+		ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+	return PG_LEVEL_4K;
+}
+
+static int tdx_online_cpu(unsigned int cpu)
+{
+	unsigned long flags;
+	int r;
+
+	/* Sanity check CPU is already in post-VMXON */
+	WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
+
+	local_irq_save(flags);
+	r = tdx_cpu_enable();
+	local_irq_restore(flags);
+
+	return r;
+}
+
+static int tdx_offline_cpu(unsigned int cpu)
+{
+	int i;
+
+	/* No TD is running. Any CPU may be offlined. */
+	if (!atomic_read(&nr_configured_hkid))
+		return 0;
+
+	/*
+	 * Reclaiming a TDX HKID (i.e. when destroying a guest TD) requires
+	 * calling TDH.PHYMEM.PAGE.WBINVD on all packages to program every
+	 * memory controller via PCONFIG. While any TDX HKID is active, refuse
+	 * to offline the last online CPU of a package.
+	 */
+	for_each_online_cpu(i) {
+		/*
+		 * Another online CPU exists on the same package, so offlining
+		 * this one is allowed.
+		 */
+		if (i != cpu && topology_physical_package_id(i) ==
+				topology_physical_package_id(cpu))
+			return 0;
+	}
+
+	/*
+	 * This is the last online CPU of this package. Don't offline it.
+	 *
+	 * Print a warning, as it is otherwise hard for a human operator to
+	 * figure out why offlining is refused.
+	 */
+#define MSG_ALLPKG_ONLINE \
+	"TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
+	pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
+	return -EBUSY;
+}
+
+static void __do_tdx_cleanup(void)
+{
+	/*
+	 * Once the TDX module is initialized, it cannot be disabled and
+	 * re-initialized again w/o runtime update (which isn't
+	 * supported by kernel).  Only need to remove the cpuhp here.
+ * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. + */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, tdx_offline_cpu); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + const struct tdx_sys_info_td_conf *td_conf; + int r, i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { + /* + * Check if MSRs (tdx_uret_msrs) can be saved/restored + * before returning to user space. + * + * this_cpu_ptr(user_return_msrs)->registered isn't checked + * because the registration is done at vcpu runtime by + * tdx_user_return_msr_update_cache(). + */ + tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); + if (tdx_uret_msrs[i].slot == -1) { + /* If any MSR isn't supported, it is a KVM bug */ + pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", + tdx_uret_msrs[i].msr); + return -EIO; + } + } + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* Get TDX global information for later use */ + tdx_sysinfo = tdx_get_sysinfo(); + if (WARN_ON_ONCE(!tdx_sysinfo)) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* Check TDX module and KVM capabilities */ + if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || + !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) + goto get_sysinfo_err; + + if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) + goto get_sysinfo_err; + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to + * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU + * extension on per-VM basis. + * + * TDX module reports such limit via the MAX_VCPU_PER_TD global + * metadata. Different modules may report different values. + * Some old module may also not support this metadata (in which + * case this limit is U16_MAX). + * + * In practice, the reported value reflects the maximum logical + * CPUs that ALL the platforms that the module supports can + * possibly have. + * + * Simply forwarding the MAX_VCPU_PER_TD to userspace could + * result in an unpredictable ABI. KVM instead always advertise + * the number of logical CPUs the platform has as the maximum + * vCPUs for TDX guests. + * + * Make sure MAX_VCPU_PER_TD reported by TDX module is not + * smaller than the number of logical CPUs, otherwise KVM will + * report an unsupported value to userspace. + * + * Note, a platform with TDX enabled in the BIOS cannot support + * physical CPU hotplug, and TDX requires the BIOS has marked + * all logical CPUs in MADT table as enabled. Just use + * num_present_cpus() for the number of logical CPUs. 
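+	 * For example, on a hypothetical platform with 128 logical CPUs, KVM
+	 * advertises 128 as the TDX vCPU limit even if the module reports a
+	 * larger MAX_VCPU_PER_TD.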
+ */ + td_conf = &tdx_sysinfo->td_conf; + if (td_conf->max_vcpus_per_td < num_present_cpus()) { + pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", + td_conf->max_vcpus_per_td, num_present_cpus()); + r = -EINVAL; + goto get_sysinfo_err; + } + + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; + +get_sysinfo_err: + __tdx_cleanup(); +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r, i; + + /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); + + if (!enable_tdx) + return 0; + + if (!enable_ept) { + pr_err("EPT is required for TDX\n"); + goto success_disable_tdx; + } + + if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { + pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); + goto success_disable_tdx; + } + + if (!enable_apicv) { + pr_err("APICv is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + pr_err("tdx: OSXSAVE is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { + pr_err("tdx: MOVDIR64B is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { + pr_err("Self-snoop is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. 
+ */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..51f98443e8a2 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#include "tdx_arch.h" +#include "tdx_errno.h" + +#ifdef CONFIG_KVM_INTEL_TDX +#include "common.h" + +int tdx_bringup(void); +void tdx_cleanup(void); + +extern bool enable_tdx; + +/* TDX module hardware states. These follow the TDX module OP_STATEs. */ +enum kvm_tdx_state { + TD_STATE_UNINITIALIZED = 0, + TD_STATE_INITIALIZED, + TD_STATE_RUNNABLE, +}; + +struct kvm_tdx { + struct kvm kvm; + + struct misc_cg *misc_cg; + int hkid; + enum kvm_tdx_state state; + + u64 attributes; + u64 xfam; + + u64 tsc_offset; + u64 tsc_multiplier; + + struct tdx_td td; + + /* For KVM_TDX_INIT_MEM_REGION. */ + atomic64_t nr_premapped; + + /* + * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do + * not contend with tdh_vp_enter() and TDCALLs. + * Set/unset is protected with kvm->mmu_lock. + */ + bool wait_for_sept_zap; +}; + +/* TDX module vCPU states */ +enum vcpu_tdx_state { + VCPU_TD_STATE_UNINITIALIZED = 0, + VCPU_TD_STATE_INITIALIZED, +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; + struct vcpu_vt vt; + u64 ext_exit_qualification; + gpa_t exit_gpa; + struct tdx_module_args vp_enter_args; + + struct tdx_vp vp; + + struct list_head cpu_list; + + u64 vp_enter_ret; + + enum vcpu_tdx_state state; + bool guest_entered; + + u64 map_gpa_next; + u64 map_gpa_end; +}; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err); + +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field) +{ + u64 err, data; + + err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data); + if (unlikely(err)) { + pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err); + return 0; + } + return data; +} + +static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) +{ +#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL +#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL +#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL +#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK) + + /* TDX is 64bit only. HIGH field isn't supported. */ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && + VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH, + "Read/Write to TD VMCS *_HIGH fields not supported"); + + BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64); + +#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13) +#define VMCS_ENC_WIDTH_16BIT (0UL << 13) +#define VMCS_ENC_WIDTH_64BIT (1UL << 13) +#define VMCS_ENC_WIDTH_32BIT (2UL << 13) +#define VMCS_ENC_WIDTH_NATURAL (3UL << 13) +#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK) + + /* TDX is 64bit only. i.e. natural width = 64bit. 
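+	 * For example, GUEST_RIP is a natural-width field and must therefore
+	 * be accessed through the 64-bit accessors generated below
+	 * (td_vmcs_read64()/td_vmcs_write64()).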
*/ + BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) && + (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT || + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL), + "Invalid TD VMCS access for 64-bit field"); + BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT, + "Invalid TD VMCS access for 32-bit field"); + BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT, + "Invalid TD VMCS access for 16-bit field"); +} + +static __always_inline void tdvps_management_check(u64 field, u8 bits) {} +static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {} + +#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \ +static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \ + u32 field) \ +{ \ + u64 err, data; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \ + if (unlikely(err)) { \ + tdh_vp_rd_failed(tdx, #uclass, field, err); \ + return 0; \ + } \ + return (u##bits)data; \ +} \ +static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \ + u32 field, u##bits val) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \ + GENMASK_ULL(bits - 1, 0)); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \ +} \ +static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \ +} \ +static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ +} + + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu); +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err); + +TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); + +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); +TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); + +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} + +#define enable_tdx 0 + +struct kvm_tdx { + struct kvm kvm; +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; +}; + +static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; } +static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; } + +#endif + +#endif diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h new file mode 100644 index 000000000000..a30e880849e3 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural constants/data definitions for TDX SEAMCALLs */ + +#ifndef __KVM_X86_TDX_ARCH_H +#define __KVM_X86_TDX_ARCH_H + +#include <linux/types.h> + +/* TDX control structure (TDR/TDCS/TDVPS) field access codes */ +#define TDX_NON_ARCH BIT_ULL(63) +#define TDX_CLASS_SHIFT 56 +#define TDX_FIELD_MASK GENMASK_ULL(31, 0) + +#define __BUILD_TDX_FIELD(non_arch, class, field) \ + (((non_arch) ? 
TDX_NON_ARCH : 0) | \ + ((u64)(class) << TDX_CLASS_SHIFT) | \ + ((u64)(field) & TDX_FIELD_MASK)) + +#define BUILD_TDX_FIELD(class, field) \ + __BUILD_TDX_FIELD(false, (class), (field)) + +#define BUILD_TDX_FIELD_NON_ARCH(class, field) \ + __BUILD_TDX_FIELD(true, (class), (field)) + + +/* Class code for TD */ +#define TD_CLASS_EXECUTION_CONTROLS 17ULL + +/* Class code for TDVPS */ +#define TDVPS_CLASS_VMCS 0ULL +#define TDVPS_CLASS_GUEST_GPR 16ULL +#define TDVPS_CLASS_OTHER_GUEST 17ULL +#define TDVPS_CLASS_MANAGEMENT 32ULL + +enum tdx_tdcs_execution_control { + TD_TDCS_EXEC_TSC_OFFSET = 10, + TD_TDCS_EXEC_TSC_MULTIPLIER = 11, +}; + +enum tdx_vcpu_guest_other_state { + TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100, +}; + +#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0) + +static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details) +{ + return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING); +} + +/* @field is any of enum tdx_tdcs_execution_control */ +#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field)) + +/* @field is the VMCS field encoding */ +#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field)) + +/* @field is any of enum tdx_guest_other_state */ +#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field)) +#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field)) + +/* Management class fields */ +enum tdx_vcpu_guest_management { + TD_VCPU_PEND_NMI = 11, +}; + +/* @field is any of enum tdx_vcpu_guest_management */ +#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field)) + +#define TDX_EXTENDMR_CHUNKSIZE 256 + +struct tdx_cpuid_value { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +} __packed; + +#define TDX_TD_ATTR_DEBUG BIT_ULL(0) +#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28) +#define TDX_TD_ATTR_PKS BIT_ULL(30) +#define TDX_TD_ATTR_KL BIT_ULL(31) +#define TDX_TD_ATTR_PERFMON BIT_ULL(63) + +#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0) +#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6 +/* + * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. + */ +struct td_params { + u64 attributes; + u64 xfam; + u16 max_vcpus; + u8 reserved0[6]; + + u64 eptp_controls; + u64 config_flags; + u16 tsc_frequency; + u8 reserved1[38]; + + u64 mrconfigid[6]; + u64 mrowner[6]; + u64 mrownerconfig[6]; + u64 reserved2[4]; + + union { + DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values); + u8 reserved3[768]; + }; +} __packed __aligned(1024); + +/* + * Guest uses MAX_PA for GPAW when set. + * 0: GPA.SHARED bit is GPA[47] + * 1: GPA.SHARED bit is GPA[51] + */ +#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0) + +/* + * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP + * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered. + * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved. + */ +#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2) + + +/* + * TDX requires the frequency to be defined in units of 25MHz, which is the + * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX + * module can only program frequencies that are multiples of 25MHz. The + * frequency must be between 100mhz and 10ghz (inclusive). 
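+ * For example, a (hypothetical) 2,500,000 kHz / 2.5 GHz TSC is encoded as
+ * 2500000 / (25 * 1000) = 100 units of 25MHz.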
+ */ +#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000)) +#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000)) +#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) +#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) + +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + +#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) + +/* + * TD scope metadata field ID. + */ +#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL + +#endif /* __KVM_X86_TDX_ARCH_H */ diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h new file mode 100644 index 000000000000..6ff4672c4181 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_errno.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural status code for SEAMCALL */ + +#ifndef __KVM_X86_TDX_ERRNO_H +#define __KVM_X86_TDX_ERRNO_H + +#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL + +/* + * TDX SEAMCALL Status Codes (returned in RAX) + */ +#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL +#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL +#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL +#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL +#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL +#define TDX_OPERAND_INVALID 0xC000010000000000ULL +#define TDX_OPERAND_BUSY 0x8000020000000000ULL +#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL +#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL +#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL +#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL +#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL +#define TDX_KEY_CONFIGURED 0x0000081500000000ULL +#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL +#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL +#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL +#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL +#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL + +/* + * TDX module operand ID, appears in 31:0 part of error code as + * detail information + */ +#define TDX_OPERAND_ID_RCX 0x01 +#define TDX_OPERAND_ID_TDR 0x80 +#define TDX_OPERAND_ID_SEPT 0x92 +#define TDX_OPERAND_ID_TD_EPOCH 0xa9 + +#endif /* __KVM_X86_TDX_ERRNO_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index f6986dee6f8c..0a6cf5bff2aa 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -59,8 +59,7 @@ * without the explicit restore, thinks the stack is getting walloped. * Using an unwind hint is problematic due to x86-64's dynamic alignment. 
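 * ("leave" is architecturally equivalent to the "mov %_ASM_BP, %_ASM_SP;
 *  pop %_ASM_BP" sequence it replaces here, in a single instruction.)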
*/ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP + leave RET .endm diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 893366e53732..4953846cb30d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -46,6 +46,7 @@ #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> @@ -53,6 +54,7 @@ #include <trace/events/ipi.h> #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -115,6 +117,8 @@ module_param(enable_apicv, bool, 0444); bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -273,6 +277,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: @@ -380,9 +385,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -393,7 +398,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -769,8 +774,11 @@ void vmx_emergency_disable_virtualization_cpu(void) return; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) + loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); + if (v->shadow_vmcs) + vmcs_clear(v->shadow_vmcs); + } kvm_cpu_vmxoff(); } @@ -1063,7 +1071,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, * provide that period, so a CPU could write host's record into * guest's memory. 
*/ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); @@ -1192,13 +1200,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1206,13 +1214,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1225,9 +1233,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); + wrmsrq(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } @@ -1248,7 +1256,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 
*/ if (vmx->pt_desc.host.ctl) - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1281,6 +1289,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); @@ -1309,7 +1318,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (vmx->guest_state_loaded) + if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1330,15 +1339,15 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -1347,14 +1356,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - vmx->guest_state_loaded = true; + vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->guest_state_loaded) + if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1362,7 +1371,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -1382,10 +1391,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); - vmx->guest_state_loaded = false; + vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } @@ -1393,8 +1402,8 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->guest_state_loaded) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + if (vmx->vt.guest_state_loaded) + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; } @@ -1402,8 +1411,8 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->guest_state_loaded) - wrmsrl(MSR_KERNEL_GS_BASE, data); + if (vmx->vt.guest_state_loaded) + wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } @@ -1441,8 +1450,7 @@ static void 
shrink_ple_window(struct kvm_vcpu *vcpu) } } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1469,16 +1477,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - - /* - * No indirect branch prediction barrier needed when switching - * the active VMCS within a vCPU, unless IBRS is advertised to - * the vCPU. To minimize the number of IBPBs executed, KVM - * performs IBPB on nested VM-Exit (a single nested transition - * may switch the active VMCS multiple times). - */ - if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) - indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -1514,16 +1512,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -1582,7 +1576,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } bool vmx_get_if_flag(struct kvm_vcpu *vcpu) @@ -1636,7 +1630,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * result in a #GP unless the same write also clears TraceEn. */ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && - ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + (data & RTIT_CTL_TRACEEN) && + data != vmx->pt_desc.guest.ctl) return 1; /* @@ -1701,16 +1696,22 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). 
*/ - if (to_vmx(vcpu)->exit_reason.enclave_mode) { + if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } + + /* Check that emulation is possible during event vectoring */ + if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + return X86EMUL_CONTINUE; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; @@ -1857,7 +1858,7 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) return; } - WARN_ON_ONCE(vmx->emulation_required); + WARN_ON_ONCE(vmx->vt.emulation_required); if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -1908,8 +1909,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); vmx_setup_uret_msr(vmx, MSR_TSC_AUX, - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); + guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); /* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new @@ -2062,7 +2063,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -2078,13 +2079,13 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) return 1; msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) @@ -2097,7 +2098,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * sanity checking and refuse to boot. Filter all unsupported * features out. 
*/ - if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) + if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); #endif @@ -2167,7 +2168,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, u64 data) { #ifdef CONFIG_X86_64 - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return (u32)data; #endif return (unsigned long)data; @@ -2178,7 +2179,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated u64 debugctl = 0; if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) + (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && @@ -2282,7 +2283,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) @@ -2384,7 +2385,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * behavior, but it's close enough. */ if (!msr_info->host_initiated && - (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) return 1; @@ -2394,7 +2395,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: @@ -2468,9 +2469,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PERF_CAP_PEBS_MASK) != (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; @@ -2570,11 +2571,39 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; - rdmsrl(msr, allowed); + rdmsrq(msr, allowed); return ctl_opt & allowed; } +#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ +({ \ + int i, r = 0; \ + \ + BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ + BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ + \ + for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ + typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ + typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ + \ + if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ + continue; \ + \ + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ + "entry = %llx (%llx), exit = %llx (%llx)\n", \ + (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ + (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ + \ + if (error_on_inconsistent_vmcs_config) \ + r = -EIO; \ + \ + entry_controls &= ~n_ctrl; \ + exit_controls &= ~x_ctrl; \ + } \ + r; \ +}) + static int setup_vmcs_config(struct 
vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2586,7 +2615,6 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _vmentry_control = 0; u64 basic_msr; u64 misc_msr; - int i; /* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. @@ -2690,22 +2718,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control)) return -EIO; - for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { - u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; - u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; - - if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) - continue; - - pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", - _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); - - if (error_on_inconsistent_vmcs_config) - return -EIO; - - _vmentry_control &= ~n_ctrl; - _vmexit_control &= ~x_ctrl; - } + if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, + _vmentry_control, _vmexit_control)) + return -EIO; /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they @@ -2728,7 +2743,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, break; } - rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) @@ -2748,7 +2763,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; - rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + rdmsrq(MSR_IA32_VMX_MISC, misc_msr); vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -2832,7 +2847,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; @@ -3386,7 +3401,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_ept_level(void) @@ -3516,7 +3531,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) @@ -3649,7 +3664,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); } void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4177,50 +4192,6 @@ void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - int pi_vec) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of the virtual has already been set in the PIR. - * Send a notification event to deliver the virtual interrupt - * unless the vCPU is the currently running vCPU, i.e. the - * event is being sent from a fastpath VM-Exit handler, in - * which case the PIR will be synced to the vIRR before - * re-entering the guest. 
- * - * When the target is not the running vCPU, the following - * possibilities emerge: - * - * Case 1: vCPU stays in non-root mode. Sending a notification - * event posts the interrupt to the vCPU. - * - * Case 2: vCPU exits to root mode and is still runnable. The - * PIR will be synced to the vIRR before re-entering the guest. - * Sending a notification event is ok as the host IRQ handler - * will ignore the spurious event. - * - * Case 3: vCPU exits to root mode and is blocked. vcpu_block() - * has already synced PIR to vIRR and never blocks the vCPU if - * the vIRR is not empty. Therefore, a blocked vCPU here does - * not wait for any requested interrupts in PIR, and sending a - * notification event also results in a benign, spurious event. - */ - - if (vcpu != kvm_get_running_vcpu()) - __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return; - } -#endif - /* - * The vCPU isn't in the guest; wake the vCPU in case it is blocking, - * otherwise do nothing as KVM will grab the highest priority pending - * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). - */ - kvm_vcpu_wake_up(vcpu); -} - static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { @@ -4269,7 +4240,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); @@ -4280,20 +4251,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!vcpu->arch.apic->apicv_active) return -1; - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return 0; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return 0; - - /* - * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() - * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is - * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a - * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 
- */ - kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); + __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); return 0; } @@ -4373,7 +4331,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { @@ -4590,10 +4548,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, bool __enabled; \ \ if (cpu_has_vmx_##name()) { \ - if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ - __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ - else \ - __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ + __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ __enabled, exiting); \ } \ @@ -4669,8 +4624,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) */ if (cpu_has_vmx_rdtscp()) { bool rdpid_or_rdtscp_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_ENABLE_RDTSCP, @@ -4763,7 +4718,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -4820,7 +4775,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } vmx_write_encls_bitmap(&vmx->vcpu, NULL); @@ -4876,8 +4831,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - __pi_set_sn(&vmx->pi_desc); + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); } void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -5211,6 +5166,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } +static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_fpu.fpstate->xfd && + !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5237,7 +5198,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * point. */ if (is_nm_fault(intr_info)) { - kvm_queue_exception(vcpu, NM_VECTOR); + kvm_queue_exception_p(vcpu, NM_VECTOR, + is_xfd_nm_fault(vcpu) ? 
vcpu->arch.guest_fpu.xfd_err : 0); return 1; } @@ -5644,6 +5606,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5781,11 +5749,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5801,23 +5766,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) - ? PFERR_PRESENT_MASK : 0; - - if (error_code & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because @@ -5829,7 +5777,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) @@ -5865,11 +5813,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } -static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +/* + * Returns true if emulation is required (due to the vCPU having invalid state + * with unrestricted guest mode disabled) and KVM can't faithfully emulate the + * current vCPU state. + */ +static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->emulation_required && !vmx->rmode.vm86_active && + if (!vmx->vt.emulation_required) + return false; + + /* + * It is architecturally impossible for emulation to be required when a + * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if + * guest state is invalid and unrestricted guest is disabled, i.e. KVM + * should synthesize VM-Fail instead of emulating L2 code. This path is + * only reachable if userspace modifies L2 guest state after KVM has + * performed the nested VM-Enter consistency checks. + */ + if (vmx->nested.nested_run_pending) + return true; + + /* + * KVM only supports emulating exceptions if the vCPU is in Real Mode. + * If emulation is required, KVM can't perform a successful VM-Enter to + * inject the exception. 
+ */ + return !vmx->rmode.vm86_active && (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } @@ -5882,7 +5854,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; - while (vmx->emulation_required && count-- != 0) { + while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5892,7 +5864,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5916,7 +5888,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5959,7 +5931,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } operand; int gpr_index; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6049,7 +6021,7 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu) /* * When nested=0, all VMX instruction VM Exits filter here. The handlers - * are overwritten by nested_vmx_setup() when nested=1. + * are overwritten by nested_vmx_hardware_setup() when nested=1. */ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) { @@ -6077,7 +6049,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). */ - to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } @@ -6175,9 +6147,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); - *reason = vmx->exit_reason.full; + *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason.failed_vmentry)) { + if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -6191,6 +6163,15 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, } } +void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); + else + *error_code = 0; +} + static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { @@ -6202,32 +6183,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u16 pml_idx, pml_tail_index; u64 *pml_buf; - u16 pml_idx; + int i; pml_idx = vmcs_read16(GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ - if (pml_idx == (PML_ENTITY_NUM - 1)) + if (pml_idx == PML_HEAD_INDEX) return; + /* + * PML index always points to the next available PML buffer entity + * unless PML log has just overflowed. + */ + pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 
0 : pml_idx + 1; - /* PML index always points to next available PML buffer entity */ - if (pml_idx >= PML_ENTITY_NUM) - pml_idx = 0; - else - pml_idx++; - + /* + * PML log is written backwards: the CPU first writes the entry 511 + * then the entry 510, and so on. + * + * Read the entries in the same order they were written, to ensure that + * the dirty ring is filled in the same order the CPU wrote them. + */ pml_buf = page_address(vmx->pml_pg); - for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + + for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { u64 gpa; - gpa = pml_buf[pml_idx]; + gpa = pml_buf[i]; WARN_ON(gpa & (PAGE_SIZE - 1)); kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } static void vmx_dump_sel(char *name, uint32_t sel) @@ -6456,7 +6445,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; @@ -6512,7 +6501,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * the least awful solution for the userspace case without * risking false positives. */ - if (vmx->emulation_required) { + if (vmx->vt.emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } @@ -6522,7 +6511,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } /* If guest state is invalid, start emulating. L2 is handled above. */ - if (vmx->emulation_required) + if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { @@ -6543,33 +6532,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 0; } - /* - * Note: - * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by - * delivery event since it indicates guest is accessing MMIO. - * The vm-exit can be triggered again after return to guest that - * will cause infinite loop. - */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH && - exit_reason.basic != EXIT_REASON_NOTIFY)) { - int ndata = 3; - - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason.full; - vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); - if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.data[ndata++] = - vmcs_read64(GUEST_PHYSICAL_ADDRESS); - } - vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; - vcpu->run->internal.ndata = ndata; + exit_reason.basic != EXIT_REASON_NOTIFY && + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { + kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); return 0; } @@ -6640,7 +6611,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. 
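A standalone sketch of the PML drain order implemented above; the helper and callback names below are invented, only the index math and walk direction mirror vmx_flush_pml_buffer():

/* Model of the PML drain: entry 511 is written first, so read head -> tail. */
#define PML_LOG_NR_ENTRIES	512
#define PML_HEAD_INDEX		(PML_LOG_NR_ENTRIES - 1)

static void drain_pml_model(const u64 *pml_buf, u16 pml_idx,
			    void (*mark_dirty)(u64 gpa))
{
	int tail, i;

	/* Hardware leaves the index at the head when nothing was logged. */
	if (pml_idx == PML_HEAD_INDEX)
		return;

	/* pml_idx points at the next free slot unless the log just overflowed. */
	tail = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;

	/* Walk from the head down to the tail, i.e. in CPU write order. */
	for (i = PML_HEAD_INDEX; i >= tail; i--)
		mark_dirty(pml_buf[i]);
}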
*/ - if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; @@ -6694,7 +6665,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -6862,11 +6833,32 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) read_unlock(&vcpu->kvm->mmu_lock); } -void vmx_hwapic_isr_update(int max_isr) +void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; + /* + * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI + * is only relevant if and only if Virtual Interrupt Delivery is + * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's + * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested + * VM-Exit, otherwise L1 will run with a stale SVI. + */ + if (is_guest_mode(vcpu)) { + /* + * KVM is supposed to forward intercepted L2 EOIs to L1 if VID + * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. + * Note, userspace can stuff state while L2 is active; assert + * that VID is disabled if and only if the vCPU is in KVM_RUN + * to avoid false positives if userspace is setting APIC state. + */ + WARN_ON_ONCE(vcpu->wants_to_run && + nested_cpu_has_vid(get_vmcs12(vcpu))); + to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; + return; + } + if (max_isr == -1) max_isr = 0; @@ -6896,38 +6888,24 @@ static void vmx_set_rvi(int vector) } } -void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ - /* - * When running L2, updating RVI is only relevant when - * vmcs12 virtual-interrupt-delivery enabled. - * However, it can be enabled only when L1 also - * intercepts external-interrupts and in that case - * we should not update vmcs02 RVI but instead intercept - * interrupt. Therefore, do nothing when running L2. - */ - if (!is_guest_mode(vcpu)) - vmx_set_rvi(max_irr); -} - int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); + if (pi_test_on(&vt->pi_desc)) { + pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; @@ -6967,14 +6945,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -6985,17 +6955,16 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * - * Do not blindly read xfd_err here, since this exception might - * be caused by L1 interception on a platform which doesn't - * support xfd at all. 
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM + * interception may have been caused by L1 interception. Per the SDM, + * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * - * Do it conditionally upon guest_fpu::xfd. xfd_err matters - * only when xfd contains a non-zero value. - * - * Queuing exception is done in vmx_handle_exit. See comment there. + * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. + * unlike CR2 and DR6, the value is not a payload that is attached to + * the #NM exception. */ - if (vcpu->arch.guest_fpu.fpstate->xfd) - rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (is_xfd_nm_fault(vcpu)) + rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) @@ -7032,14 +7001,12 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->emulation_required) + if (to_vt(vcpu)->emulation_required) return; - if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } @@ -7145,13 +7112,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); + case INTR_TYPE_HARD_EXCEPTION: { + u32 error_code = 0; + + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(error_code_field); + + kvm_requeue_exception(vcpu, vector, + idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, + error_code); break; + } case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; @@ -7246,7 +7217,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) - vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. @@ -7257,7 +7228,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) - native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } @@ -7270,10 +7241,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, * the fastpath even, all other exits must use the slow path. 
*/ if (is_guest_mode(vcpu) && - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; - switch (to_vmx(vcpu)->exit_reason.basic) { + switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -7285,6 +7256,20 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, } } +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) +{ + if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || + !is_nmi(vmx_get_intr_info(vcpu))) + return; + + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); +} + static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned int flags) { @@ -7297,10 +7282,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. + * + * cpu_buf_vm_clear is used when the system is not vulnerable to MDS/TAA, + * and is affected by MMIO Stale Data. In such cases mitigation is only + * needed against an MMIO capable guest. */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mmio_stale_data_clear) && + else if (static_branch_unlikely(&cpu_buf_vm_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); @@ -7320,23 +7309,15 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + vmx->vt.exit_reason.full = 0xdead; goto out; } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); - if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && - is_nmi(vmx_get_intr_info(vcpu))) { - kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); - kvm_after_interrupt(vcpu); - } + vmx_handle_nmi(vcpu); out: guest_state_exit_irqoff(); @@ -7357,15 +7338,15 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. 
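A condensed model of the pre-VM-Entry mitigation choice made above; the enum and helper are invented for illustration, only the conditions mirror the code and its comment:

enum vmentry_mitigation {
	MITIGATION_NONE,
	MITIGATION_L1D_FLUSH,
	MITIGATION_CLEAR_CPU_BUFFERS,
};

/*
 * Illustrative decision only; per the comment above, the MDS VERW done late
 * in VM-Entry still runs regardless of whether the L1D flush was performed.
 */
static enum vmentry_mitigation pick_vmentry_mitigation(bool l1d_flush_on,
							bool cpu_buf_vm_clear_on,
							bool has_assigned_device)
{
	if (l1d_flush_on)
		return MITIGATION_L1D_FLUSH;

	/*
	 * cpu_buf_vm_clear covers CPUs affected only by MMIO Stale Data;
	 * clearing CPU buffers matters only for a guest that can actually
	 * reach MMIO, i.e. one with an assigned device.
	 */
	if (cpu_buf_vm_clear_on && has_assigned_device)
		return MITIGATION_CLEAR_CPU_BUFFERS;

	return MITIGATION_NONE;
}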
*/ - if (unlikely(vmx->emulation_required)) { + if (unlikely(vmx->vt.emulation_required)) { vmx->fail = 0; - vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; - vmx->exit_reason.failed_vmentry = 1; + vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); - vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); - vmx->exit_intr_info = 0; + vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7407,10 +7388,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7446,8 +7423,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* @@ -7472,7 +7449,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * checking. */ if (vmx->nested.nested_run_pending && - !vmx->exit_reason.failed_vmentry) + !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; @@ -7481,12 +7458,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason.failed_vmentry)) + if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -7518,7 +7495,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - INIT_LIST_HEAD(&vmx->pi_wakeup_list); + INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; @@ -7616,7 +7593,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], - __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; @@ -7643,6 +7620,7 @@ int vmx_vm_init(struct kvm *kvm) case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -7660,9 +7638,23 @@ int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } +static inline bool vmx_ignore_guest_pat(struct kvm *kvm) +{ + /* + * Non-coherent DMA devices need the guest to flush CPU properly. + * In that case it is not possible to map all guest RAM as WB, so + * always trust guest PAT. 
+ */ + return !kvm_arch_has_noncoherent_dma(kvm) && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); +} + u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* @@ -7672,13 +7664,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - /* - * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached. Letting the guest control memory types on Intel - * CPUs may result in unexpected behavior, and so KVM's ABI is to trust - * the guest to behave only as a last resort. - */ - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* Force WB if ignoring guest PAT */ + if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); @@ -7828,12 +7815,8 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM); + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); vmx_setup_uret_msrs(vmx); @@ -7841,7 +7824,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); - if (guest_can_use(vcpu, X86_FEATURE_VMX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; @@ -7850,25 +7833,25 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (guest_can_use(vcpu, X86_FEATURE_VMX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && - guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { - bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); vmx_set_guest_uret_msr(vmx, msr, enabled ? 
0 : TSX_CTRL_RTM_DISABLE); } } if (kvm_cpu_cap_has(X86_FEATURE_XFD)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); if (boot_cpu_has(X86_FEATURE_IBPB)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, @@ -7876,17 +7859,17 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_LC_ENABLED; else @@ -7906,7 +7889,7 @@ static __init u64 vmx_get_perf_capabilities(void) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); @@ -8001,38 +7984,50 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info) +static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + unsigned long *exit_qualification) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; - bool intercept; int size; + bool imm; + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { port = info->src_val; size = info->dst_bytes; + imm = info->src_type == OP_IMM; } else { port = info->dst_val; size = info->src_bytes; + imm = info->dst_type == OP_IMM; } - /* - * If the 'use IO bitmaps' VM-execution control is 0, IO instruction - * VM-exits depend on the 'unconditional IO exiting' VM-execution - * control. - * - * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. - */ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - intercept = nested_cpu_has(vmcs12, - CPU_BASED_UNCOND_IO_EXITING); - else - intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ - return intercept ? 
X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; + *exit_qualification = ((unsigned long)port << 16) | (size - 1); + + if (info->intercept == x86_intercept_ins || + info->intercept == x86_intercept_outs) + *exit_qualification |= BIT(4); + + if (info->rep_prefix) + *exit_qualification |= BIT(5); + + if (imm) + *exit_qualification |= BIT(6); + + return nested_vmx_check_io_bitmaps(vcpu, port, size); } int vmx_check_intercept(struct kvm_vcpu *vcpu, @@ -8041,26 +8036,34 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qualification = 0; + u32 vm_exit_reason; + u64 exit_insn_len; switch (info->intercept) { - /* - * RDPID causes #UD if disabled through secondary execution controls. - * Because it is marked as EmulateOnUD, we need to intercept it here. - * Note, RDPID is hidden behind ENABLE_RDTSCP. - */ case x86_intercept_rdpid: + /* + * RDPID causes #UD if not enabled through secondary execution + * controls (ENABLE_RDTSCP). Note, the implicit MSR access to + * TSC_AUX is NOT subject to interception, i.e. checking only + * the dedicated execution control is architecturally correct. + */ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } - break; + return X86EMUL_CONTINUE; case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: - return vmx_check_intercept_io(vcpu, info); + if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; + break; case x86_intercept_lgdt: case x86_intercept_lidt: @@ -8073,7 +8076,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + if (info->intercept == x86_intercept_lldt || + info->intercept == x86_intercept_ltr || + info->intercept == x86_intercept_sldt || + info->intercept == x86_intercept_str) + vm_exit_reason = EXIT_REASON_LDTR_TR; + else + vm_exit_reason = EXIT_REASON_GDTR_IDTR; + /* + * FIXME: Decode the ModR/M to generate the correct exit + * qualification for memory operands. + */ + break; + + case x86_intercept_hlt: + if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_HLT; break; case x86_intercept_pause: @@ -8086,17 +8106,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || - !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; + vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; break; /* TODO: check more intercepts... 
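A standalone model of the exit-qualification encoding assembled above for an intercepted I/O instruction; the helper and argument names below are invented, the bit layout mirrors the code:

/*
 * Bits 2:0 hold size - 1, bit 4 marks a string instruction (INS/OUTS),
 * bit 5 a REP prefix, bit 6 an immediate port operand, and bits 31:16
 * the port number.
 */
static unsigned long io_exit_qualification(unsigned short port, int size,
					   bool string, bool rep, bool imm)
{
	unsigned long qual = ((unsigned long)port << 16) | (size - 1);

	if (string)
		qual |= 1UL << 4;
	if (rep)
		qual |= 1UL << 5;
	if (imm)
		qual |= 1UL << 6;

	return qual;
}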
*/ default: - break; + return X86EMUL_UNHANDLEABLE; } - return X86EMUL_UNHANDLEABLE; + exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); + if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) + return X86EMUL_UNHANDLEABLE; + + __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, + exit_insn_len); + return X86EMUL_INTERCEPTED; } #ifdef CONFIG_X86_64 @@ -8411,7 +8438,7 @@ __init int vmx_hardware_setup(void) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } @@ -8500,6 +8527,8 @@ __init int vmx_hardware_setup(void) if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + else + vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID @@ -8517,9 +8546,6 @@ __init int vmx_hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vt_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8577,6 +8603,27 @@ __init int vmx_hardware_setup(void) kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + /* + * On Intel CPUs that lack self-snoop feature, letting the guest control + * memory types may result in unexpected behavior. So always ignore guest + * PAT on those CPUs and map VM as writeback, not allowing userspace to + * disable the quirk. + * + * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is + * supported, UC is slow enough to cause issues with some older guests (e.g. + * an old version of bochs driver uses ioremap() instead of ioremap_wc() to + * map the video RAM, causing wayland desktop to fail to get started + * correctly). To avoid breaking those older guests that rely on KVM to force + * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the + * safer (for performance) default behavior. + * + * On top of this, non-coherent DMA devices need the guest to flush CPU + * caches properly. This also requires honoring guest PAT, and is forced + * independent of the quirk in vmx_ignore_guest_pat(). + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } @@ -8590,23 +8637,16 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void __vmx_exit(void) +void vmx_exit(void) { allow_smaller_maxphyaddr = false; vmx_cleanup_l1d_flush(); -} -static void vmx_exit(void) -{ - kvm_exit(); - __vmx_exit(); kvm_x86_vendor_exit(); - } -module_exit(vmx_exit); -static int __init vmx_init(void) +int __init vmx_init(void) { int r, cpu; @@ -8650,21 +8690,9 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; - /* - * Common KVM initialization _must_ come last, after this, /dev/kvm is - * exposed to userspace! 
- */ - r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), - THIS_MODULE); - if (r) - goto err_kvm_init; - return 0; -err_kvm_init: - __vmx_exit(); err_l1d_flush: kvm_x86_vendor_exit(); return r; } -module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 43f573f6ca46..b5758c33c60f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -11,11 +11,13 @@ #include "capabilities.h" #include "../kvm_cache_regs.h" +#include "pmu_intel.h" #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" #include "../mmu.h" +#include "common.h" #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) @@ -67,47 +69,6 @@ struct pt_desc { struct pt_ctx guest; }; -union vmx_exit_reason { - struct { - u32 basic : 16; - u32 reserved16 : 1; - u32 reserved17 : 1; - u32 reserved18 : 1; - u32 reserved19 : 1; - u32 reserved20 : 1; - u32 reserved21 : 1; - u32 reserved22 : 1; - u32 reserved23 : 1; - u32 reserved24 : 1; - u32 reserved25 : 1; - u32 bus_lock_detected : 1; - u32 enclave_mode : 1; - u32 smi_pending_mtf : 1; - u32 smi_from_vmx_root : 1; - u32 reserved30 : 1; - u32 failed_vmentry : 1; - }; - u32 full; -}; - -struct lbr_desc { - /* Basic info about guest LBR records. */ - struct x86_pmu_lbr records; - - /* - * Emulate LBR feature via passthrough LBR registers when the - * per-vcpu guest LBR event is scheduled on the current pcpu. - * - * The records may be inaccurate if the host reclaims the LBR. - */ - struct perf_event *event; - - /* True if LBRs are marked as not intercepted in the MSR bitmap */ - bool msr_passthrough; -}; - -extern struct x86_pmu_lbr vmx_lbr_caps; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -176,6 +137,7 @@ struct nested_vmx { bool reload_vmcs01_apic_access_page; bool update_vmcs01_cpu_dirty_logging; bool update_vmcs01_apicv_status; + bool update_vmcs01_hwapic_isr; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -247,20 +209,10 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct vcpu_vt vt; u8 fail; u8 x2apic_msr_bitmap_mode; - /* - * If true, host state has been stored in vmx->loaded_vmcs for - * the CPU registers that only need to be switched when transitioning - * to/from the kernel, and the registers have been loaded with guest - * values. If false, host state is loaded in the CPU registers - * and vmx->loaded_vmcs->host_state is invalid. - */ - bool guest_state_loaded; - - unsigned long exit_qualification; - u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -273,7 +225,6 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif @@ -312,15 +263,6 @@ struct vcpu_vmx { } seg[8]; } segment_cache; int vpid; - bool emulation_required; - - union vmx_exit_reason exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Used if this vCPU is waiting for PI notification wakeup. 
*/ - struct list_head pi_wakeup_list; /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -330,14 +272,15 @@ struct vcpu_vmx { bool ple_window_dirty; /* Support for PML */ -#define PML_ENTITY_NUM 512 +#define PML_LOG_NR_ENTRIES 512 + /* PML is written backwards: this is the first entry written by the CPU */ +#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1) + struct page *pml_pg; /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included @@ -374,8 +317,44 @@ struct kvm_vmx { u64 *pid_table; }; -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy); +static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) +{ + return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); +} + +static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt) +{ + return &(container_of(vt, struct vcpu_vmx, vt)->vcpu); +} + +static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu) +{ + return to_vt(vcpu)->exit_reason; +} + +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + return vt->exit_qualification; +} + +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + return vt->exit_intr_info; +} + +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); @@ -660,45 +639,10 @@ static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) -{ - return &to_vmx(vcpu)->lbr_desc; -} - -static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) -{ - return &vcpu_to_lbr_desc(vcpu)->records; -} - -static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) -{ - return !!vcpu_to_lbr_records(vcpu)->nr; -} - void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) - vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - return vmx->exit_qualification; -} - -static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - return vmx->exit_intr_info; -} - struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -756,4 +700,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) vmx->segment_cache.bitmask = 0; } +int vmx_init(void); +void 
vmx_exit(void); + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h index bba24ed99ee6..cdf8cbb69209 100644 --- a/arch/x86/kvm/vmx/vmx_onhyperv.h +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -3,7 +3,7 @@ #ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__ #define __ARCH_X86_KVM_VMX_ONHYPERV_H__ -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <linux/jump_label.h> diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 633c87e2fd92..96677576c836 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -118,7 +118,7 @@ do_exception: #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ do_exception: * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index a55981c5216e..b4596f651232 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -46,9 +46,7 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); void vmx_migrate_timers(struct kvm_vcpu *vcpu); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu); -void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void vmx_hwapic_isr_update(int max_isr); +void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); @@ -59,6 +57,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); int vmx_get_feature_msr(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#define vmx_complete_emulated_msr kvm_complete_insn_gp u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -74,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); @@ -104,8 +104,11 @@ void 
vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr); u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code); + u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); void vmx_write_tsc_offset(struct kvm_vcpu *vcpu); @@ -118,4 +121,49 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); #endif void vmx_setup_mce(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_INTEL_TDX +void tdx_disable_virtualization_cpu(void); +int tdx_vm_init(struct kvm *kvm); +void tdx_mmu_release_hkid(struct kvm *kvm); +void tdx_vm_destroy(struct kvm *kvm); +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); + +int tdx_vcpu_create(struct kvm_vcpu *vcpu); +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void tdx_vcpu_free(struct kvm_vcpu *vcpu); +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void tdx_vcpu_put(struct kvm_vcpu *vcpu); +bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu); +int tdx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath); + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void tdx_inject_nmi(struct kvm_vcpu *vcpu); +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +bool tdx_has_emulated_msr(u32 index); +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +#endif + #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c79a8cc57ba4..a9d992d5652f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,7 +90,6 @@ #include "trace.h" #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 /* * Note, kvm_caps fields should *never* have default values, all fields must be @@ -119,8 +118,6 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; - #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE @@ -229,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; 
EXPORT_SYMBOL_GPL(enable_apicv); +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_GPL(enable_device_posted_irqs); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -580,7 +580,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(kvm_uret_msrs_list[slot], values->host); + wrmsrq(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -592,10 +592,10 @@ static int kvm_probe_user_return_msr(u32 msr) int ret; preempt_disable(); - ret = rdmsrl_safe(msr, &val); + ret = rdmsrq_safe(msr, &val); if (ret) goto out; - ret = wrmsrl_safe(msr, val); + ret = wrmsrq_safe(msr, val); out: preempt_enable(); return ret; @@ -632,12 +632,21 @@ static void kvm_user_return_msr_cpu_online(void) int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrl_safe(kvm_uret_msrs_list[i], &value); + rdmsrq_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -646,20 +655,25 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + + msrs->values[slot].curr = value; + kvm_user_return_register_notifier(msrs); +} +EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); + static void drop_user_return_notifiers(void) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -802,9 +816,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool has_payload, unsigned long payload, bool reinject) +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error, u32 error_code, + bool has_payload, unsigned long payload) { u32 prev_nr; int class1, class2; @@ -812,13 +826,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); /* - * If the exception is destined for L2 and isn't being reinjected, - * morph it to a VM-Exit if L1 wants to intercept the exception. A - * previously injected exception is not checked because it was checked - * when it was original queued, and re-checking is incorrect if _L1_ - * injected the exception, in which case it's exempt from interception. 
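A hypothetical usage sketch for the new kvm_user_return_msr_update_cache() helper above: unlike kvm_set_user_return_msr(), it does not write the MSR, it only refreshes KVM's cached value and arms the user-return notifier. The caller below is invented for illustration:

/*
 * Hypothetical caller: some mechanism outside kvm_set_user_return_msr() has
 * already loaded guest_val into the user-return MSR tracked at 'slot'.
 * Refresh only KVM's cached 'curr' value so kvm_on_user_return() still
 * restores the host value when the CPU returns to userspace.
 */
static void note_uret_msr_loaded_by_hw(unsigned int slot, u64 guest_val)
{
	kvm_user_return_msr_update_cache(slot, guest_val);
}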
+ * If the exception is destined for L2, morph it to a VM-Exit if L1 + * wants to intercept the exception. */ - if (!reinject && is_guest_mode(vcpu) && + if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, has_payload, payload); @@ -827,28 +838,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (reinject) { - /* - * On VM-Entry, an exception can be pending if and only - * if event injection was blocked by nested_run_pending. - * In that case, however, vcpu_enter_guest() requests an - * immediate exit, and the guest shouldn't proceed far - * enough to need reinjection. - */ - WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); - vcpu->arch.exception.injected = true; - if (WARN_ON_ONCE(has_payload)) { - /* - * A reinjected event has already - * delivered its payload. - */ - has_payload = false; - payload = 0; - } - } else { - vcpu->arch.exception.pending = true; - vcpu->arch.exception.injected = false; - } + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; @@ -889,29 +881,52 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); + kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, true, error_code, - true, payload, false); + kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); +} + +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code) +{ + + /* + * On VM-Entry, an exception can be pending if and only if event + * injection was blocked by nested_run_pending. In that case, however, + * vcpu_enter_guest() requests an immediate exit, and the guest + * shouldn't proceed far enough to need reinjection. + */ + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); + + /* + * Do not check for interception when injecting an event for L2, as the + * exception was checked for intercept when it was original queued, and + * re-checking is incorrect if _L1_ injected the exception, in which + * case it's exempt from interception. 
+ */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu->arch.exception.injected = true; + vcpu->arch.exception.has_error_code = has_error_code; + vcpu->arch.exception.vector = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; } +EXPORT_SYMBOL_GPL(kvm_requeue_exception); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -982,16 +997,10 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); - /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. @@ -1179,16 +1188,16 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != kvm_host.xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); + wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); } if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) - write_pkru(vcpu->arch.pkru); + wrpkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -1202,7 +1211,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - write_pkru(vcpu->arch.host_pkru); + wrpkru(vcpu->arch.host_pkru); } if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { @@ -1210,9 +1219,9 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != kvm_host.xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, kvm_host.xss); + wrmsrq(MSR_IA32_XSS, kvm_host.xss); } } @@ -1266,7 +1275,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } @@ -1283,18 +1292,6 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (cr4 & cr4_reserved_bits) - return false; - - if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return false; - - return true; -} -EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); - static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { return __kvm_is_valid_cr4(vcpu, cr4) && @@ -1516,10 +1513,10 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; - if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) fixed |= DR6_BUS_LOCK; return fixed; } @@ -1603,7 +1600,7 @@ 
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1637,6 +1634,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -1679,7 +1678,7 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, *data = MSR_PLATFORM_INFO_CPUID_FAULT; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(index, data); + rdmsrq_safe(index, data); break; default: return kvm_x86_call(get_feature_msr)(index, data); @@ -1695,20 +1694,20 @@ static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) + if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS)) return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) + if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; - if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return false; if (efer & (EFER_LME | EFER_LMA) && - !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return false; - if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX)) return false; return true; @@ -1850,8 +1849,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; /* @@ -1908,8 +1907,8 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; } @@ -2094,10 +2093,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && - !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + bool enabled; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)) + goto emulate_as_nop; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT); + else + enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT; + + if (!enabled) return kvm_handle_invalid_op(vcpu); +emulate_as_nop: pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } @@ -2583,6 +2592,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { + if (vcpu->arch.guest_tsc_protected) + return; + 
trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); @@ -2640,12 +2652,18 @@ static inline bool kvm_check_tsc_unstable(void) * participates in. */ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, - u64 ns, bool matched) + u64 ns, bool matched, bool user_set_tsc) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.tsc_write_lock); + if (vcpu->arch.guest_tsc_protected) + return; + + if (user_set_tsc) + vcpu->kvm->arch.user_set_tsc = true; + /* * We also track th most recent recorded KHZ, write and time to * allow the matching interval to be extended at each write. @@ -2731,8 +2749,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) } } - if (user_value) - kvm->arch.user_set_tsc = true; /* * For a reliable TSC, we can match TSC offsets, and for an unstable @@ -2752,7 +2768,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) matched = true; } - __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } @@ -3130,15 +3146,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, +static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, + struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, - unsigned int offset, - bool force_tsc_unstable) + unsigned int offset) { - struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info hv_clock; unsigned long flags; + memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); + read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); @@ -3158,52 +3176,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, * it is consistent. 
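 * The "(guest_hv_clock->version + 1) | 1" below publishes an odd version
 * before the new data is copied in; the final increment after the second
 * smp_wmb() makes it even again, so a reader that observes an odd or
 * changed version discards what it read and retries.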
*/ - guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; + guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } + hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); - - if (force_tsc_unstable) - guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; + memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); smp_wmb(); - guest_hv_clock->version = ++vcpu->hv_clock.version; + guest_hv_clock->version = ++hv_clock.version; kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } -static int kvm_guest_time_update(struct kvm_vcpu *v) +int kvm_guest_time_update(struct kvm_vcpu *v) { + struct pvclock_vcpu_time_info hv_clock = {}; unsigned long flags, tgt_tsc_khz; unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; bool use_master_clock; -#ifdef CONFIG_KVM_XEN - /* - * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless - * explicitly told to use TSC as its clocksource Xen will not set this bit. - * This default behaviour led to bugs in some guest kernels which cause - * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. - */ - bool xen_pvclock_tsc_unstable = - ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; -#endif kernel_ns = 0; host_tsc = 0; @@ -3264,35 +3264,57 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); + &vcpu->pvclock_tsc_shift, + &vcpu->pvclock_tsc_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; - kvm_xen_update_tsc_info(v); } - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + hv_clock.tsc_shift = vcpu->pvclock_tsc_shift; + hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul; + hv_clock.tsc_timestamp = tsc_timestamp; + hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; + hv_clock.flags = 0; if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; + + if (vcpu->pv_time.active) { + /* + * GUEST_STOPPED is only supported by kvmclock, and KVM's + * historic behavior is to only process the request if kvmclock + * is active/enabled. 
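+	 * The request is raised by the KVM_KVMCLOCK_CTRL ioctl and by the
+	 * suspend notifier; it is consumed here, and the flag is dropped
+	 * again below so that the Xen and Hyper-V copies never see
+	 * GUEST_STOPPED.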
+ */ + if (vcpu->pvclock_set_guest_stopped_request) { + hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0); + + hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; + } - vcpu->hv_clock.flags = pvclock_flags; + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); - if (vcpu->pv_time.active) - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. + * + * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters! + */ + if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT; + if (vcpu->xen.vcpu_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time), - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0, - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0); #endif - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -3558,7 +3580,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) sizeof(u64))) return 1; - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); @@ -3747,7 +3769,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u32 msr = msr_info->index; u64 data = msr_info->data; - if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + /* + * Do not allow host-initiated writes to trigger the Xen hypercall + * page setup; it could incur locking paths which are not expected + * if userspace sets the MSR in an unusual location. 
+ */ + if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && + !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { @@ -3767,13 +3795,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated || - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) return KVM_MSR_RET_UNSUPPORTED; vcpu->arch.arch_capabilities = data; break; case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated || - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return KVM_MSR_RET_UNSUPPORTED; if (data & ~kvm_caps.supported_perf_cap) @@ -3797,11 +3825,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((!guest_has_pred_cmd_msr(vcpu))) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB)) reserved_bits |= PRED_CMD_IBPB; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)) reserved_bits |= PRED_CMD_SBPB; } @@ -3817,12 +3845,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, data); + wrmsrq(MSR_IA32_PRED_CMD, data); break; } case MSR_IA32_FLUSH_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)) return 1; if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) @@ -3830,7 +3858,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); break; case MSR_EFER: return set_efer(vcpu, msr_info); @@ -3873,7 +3901,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: - if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); @@ -3900,10 +3928,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3920,7 +3948,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); - } else { + } else if (!vcpu->arch.guest_tsc_protected) { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; @@ -3938,7 +3966,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -4077,12 
+4105,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; @@ -4101,7 +4129,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) @@ -4111,7 +4139,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) @@ -4226,12 +4254,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.microcode_version; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.arch_capabilities; break; case MSR_IA32_PERF_CAPABILITIES: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.perf_capabilities; break; @@ -4432,12 +4460,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.status; break; @@ -4456,14 +4484,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; msr_info->data = vcpu->arch.guest_fpu.xfd_err; @@ -4545,6 +4573,20 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static u64 kvm_get_allowed_disable_exits(void) +{ + u64 r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } + return r; +} + #ifdef CONFIG_KVM_HYPERV static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) @@ -4573,6 +4615,11 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) +{ + return kvm && kvm->arch.has_protected_state ? 
0 : KVM_SYNC_X86_VALID_FIELDS; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -4681,21 +4728,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_REGS: - r = KVM_SYNC_X86_VALID_FIELDS; + r = kvm_sync_valid_fields(kvm); break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: - r = KVM_X86_DISABLE_EXITS_PAUSE; - - if (!mitigate_smt_rsb) { - r |= KVM_X86_DISABLE_EXITS_HLT | - KVM_X86_DISABLE_EXITS_CSTATE; - - if (kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; - } + r = kvm_get_allowed_disable_exits(); break; case KVM_CAP_X86_SMM: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -4716,6 +4755,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; + if (kvm) + r = kvm->max_vcpus; break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_IDS; @@ -4771,7 +4812,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; case KVM_CAP_DISABLE_QUIRKS2: - r = KVM_X86_VALID_QUIRKS; + r = kvm_caps.supported_quirks; break; case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; @@ -4952,6 +4993,8 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -4974,6 +5017,19 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_call(vcpu_load)(vcpu, cpu); + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. 
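+	 * Note: switch_vcpu_ibpb is a static key, so when the
+	 * IBPB-on-vCPU-switch mitigation is not in use the barrier below is
+	 * patched out and this path costs nothing extra.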
+ */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } + /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -4994,7 +5050,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; + if (!vcpu->arch.guest_tsc_protected) + vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) @@ -5093,6 +5150,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); @@ -5103,6 +5163,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, { int r; + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + r = kvm_apic_set_state(vcpu, s); if (r) return r; @@ -5733,8 +5796,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); - kvm->arch.user_set_tsc = true; - __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); r = 0; @@ -5822,9 +5884,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; - if (vcpu->arch.pv_cpuid.enforce) - kvm_update_pv_runtime(vcpu); - return 0; default: return -EINVAL; @@ -6284,6 +6343,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_DEVICE_ATTR: r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); break; + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; } @@ -6471,7 +6536,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) @@ -6501,11 +6566,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS2: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + if (cap->args[0] & ~kvm_caps.supported_quirks) break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -6542,30 +6607,32 @@ split_irqchip_unlock: break; case KVM_CAP_X86_DISABLE_EXITS: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) + if (cap->args[0] & ~kvm_get_allowed_disable_exits()) break; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) - kvm->arch.pause_in_guest = true; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + goto disable_exits_unlock; #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." 
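KVM_CAP_X86_DISABLE_EXITS is now accepted only under kvm->lock and only before the first vCPU is created. A minimal userspace sketch of the resulting ordering; the file descriptors, error handling and the chosen exit bit are illustrative, not taken from the patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int sys_fd = open("/dev/kvm", O_RDWR);
	int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);

	/* Only bits reported by KVM_CHECK_EXTENSION may be requested. */
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args[0] = KVM_X86_DISABLE_EXITS_PAUSE,
	};

	/* Must run before KVM_CREATE_VCPU; afterwards it fails with -EINVAL. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP");

	/* Create vCPUs only once the exits have been configured. */
	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);

	return vcpu_fd < 0 ? 1 : 0;
}

PAUSE is used here because it is always offered; HLT, MWAIT and CSTATE are only reported when mitigate_smt_rsb is off (and MWAIT additionally requires kvm_can_mwait_in_guest()), per kvm_get_allowed_disable_exits() above.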
- if (!mitigate_smt_rsb) { - if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && - (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) - pr_warn_once(SMT_RSB_MSG); - - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; - } + if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && + cpu_smt_possible() && + (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + pr_warn_once(SMT_RSB_MSG); + if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) + kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; +disable_exits_unlock: + mutex_unlock(&kvm->lock); break; case KVM_CAP_MSR_PLATFORM_INFO: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; @@ -6914,23 +6981,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; - int ret = 0; - - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time.active) - continue; - ret = kvm_set_guest_paused(vcpu); - if (ret) { - kvm_err("Failed to pause guest VCPU%d: %d\n", - vcpu->vcpu_id, ret); - break; - } - } - mutex_unlock(&kvm->lock); + /* + * Ignore the return, marking the guest paused only "fails" if the vCPU + * isn't using kvmclock; continuing on is correct and desirable. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + (void)kvm_set_guest_paused(vcpu); - return ret ? NOTIFY_BAD : NOTIFY_DONE; + return NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) @@ -7285,14 +7344,13 @@ set_pit2_out: r = READ_ONCE(kvm->arch.default_tsc_khz); goto out; } - case KVM_MEMORY_ENCRYPT_OP: { + case KVM_MEMORY_ENCRYPT_OP: r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) goto out; r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; @@ -7986,7 +8044,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; @@ -8511,17 +8569,17 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); } static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); } static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt) @@ -8813,6 +8871,28 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); +void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + u32 reason, intr_info, error_code; + struct kvm_run *run = vcpu->run; + u64 info1, info2; + int ndata = 0; + + kvm_x86_call(get_exit_info)(vcpu, 
&reason, &info1, &info2, + &intr_info, &error_code); + + run->internal.data[ndata++] = info2; + run->internal.data[ndata++] = reason; + run->internal.data[ndata++] = info1; + run->internal.data[ndata++] = gpa; + run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + run->internal.ndata = ndata; +} +EXPORT_SYMBOL_GPL(kvm_prepare_event_vectoring_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { struct kvm *kvm = vcpu->kvm; @@ -9085,6 +9165,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) return 1; + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) + return 1; + + if (r == X86EMUL_UNHANDLEABLE_VECTORING) { + kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa); + return 0; + } + WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); return handle_emulation_failure(vcpu, emulation_type); } @@ -9293,7 +9382,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -9318,7 +9407,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -9331,7 +9420,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -9360,7 +9449,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; @@ -9693,7 +9782,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * with an exception. PAT[0] is set to WB on RESET and also by the * kernel, i.e. failure indicates a kernel bug or broken firmware. 
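 * (The GENMASK(2, 0) test below checks the PAT[0] type field; 6 is the
 * write-back encoding.)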
*/ - if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("host PAT[0] is not WB\n"); return -EIO; @@ -9726,21 +9815,26 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; - rdmsrl_safe(MSR_EFER, &kvm_host.efer); + rdmsrq_safe(MSR_EFER, &kvm_host.efer); if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, kvm_host.xss); + rdmsrq(MSR_IA32_XSS, kvm_host.xss); kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + kvm_ops_update(ops); for_each_online_cpu(cpu) { @@ -9770,13 +9864,13 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + /* KVM always ignores guest PAT for shadow paging. */ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -9979,16 +10073,21 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); - ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } -unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl) +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, + int (*complete_hypercall)(struct kvm_vcpu *)) { unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); + + ++vcpu->stat.hypercalls; trace_kvm_hypercall(nr, a0, a1, a2, a3); @@ -10041,7 +10140,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, u64 gpa = a0, npages = a1, attrs = a2; ret = -KVM_ENOSYS; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) break; if (!PAGE_ALIGNED(gpa) || !npages || @@ -10052,6 +10151,13 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 
+ */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; @@ -10060,8 +10166,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); - vcpu->arch.complete_userspace_io = complete_hypercall_exit; - /* stat is incremented on completion. */ + vcpu->arch.complete_userspace_io = complete_hypercall; return 0; } default: @@ -10070,41 +10175,21 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, } out: - ++vcpu->stat.hypercalls; - return ret; + vcpu->run->hypercall.ret = ret; + return 1; } -EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall); +EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; - int cpl; - if (kvm_xen_hypercall_enabled(vcpu->kvm)) return kvm_xen_hypercall(vcpu); if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); - op_64_bit = is_64_bit_hypercall(vcpu); - cpl = kvm_x86_call(get_cpl)(vcpu); - - ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl); - if (nr == KVM_HC_MAP_GPA_RANGE && !ret) - /* MAP_GPA tosses the request to the user space. */ - return 0; - - if (!op_64_bit) - ret = (u32)ret; - kvm_rax_write(vcpu, ret); - - return kvm_skip_emulated_instruction(vcpu); + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), + complete_hypercall_exit); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -10633,6 +10718,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; kvm_x86_call(sync_pir_to_irr)(vcpu); @@ -10945,18 +11031,24 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) switch_fpu_return(); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { + set_debugreg(DR7_FIXED_1, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. 
*/ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); } + vcpu->arch.host_debugctl = get_debugctlmsr(); + guest_timing_enter_irqoff(); for (;;) { @@ -10994,6 +11086,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); @@ -11026,7 +11119,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_call(handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* * Consume any pending interrupts, including the possible source of @@ -11064,7 +11157,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -11097,7 +11191,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11106,9 +11200,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11146,10 +11237,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. 
*/ @@ -11207,9 +11300,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; @@ -11285,10 +11376,9 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) - vcpu->arch.pv.pv_unhalted = false; - else - vcpu->arch.mp_state = state; + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) + state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, state); return 1; } else { vcpu->run->exit_reason = reason; @@ -11461,8 +11551,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; + u64 sync_valid_fields; int r; + r = kvm_mmu_post_init_vm(vcpu->kvm); + if (r) + return r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); kvm_run->flags = 0; @@ -11502,8 +11597,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || - (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { + sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm); + if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) || + (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) { r = -EINVAL; goto out; } @@ -11561,7 +11657,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) out: kvm_put_guest_fpu(vcpu); - if (kvm_run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected)) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); @@ -11749,6 +11845,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11762,6 +11860,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); @@ -11804,10 +11904,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, goto out; if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); } else - vcpu->arch.mp_state = mp_state->mp_state; + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -11934,7 +12034,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); return 0; } @@ -12237,9 +12337,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED); r = kvm_mmu_create(vcpu); if (r < 0) @@ -12276,9 
+12376,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_emulate_ctxt; } - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); - kvm_async_pf_hash_reset(vcpu); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { @@ -12301,6 +12398,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_xen_init_vcpu(vcpu); vcpu_load(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); @@ -12346,10 +12444,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int idx; + int idx, cpu; + + kvm_clear_async_pf_completion_queue(vcpu); + kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); @@ -12649,6 +12753,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { @@ -12678,6 +12783,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Decided by the vendor code for other VM types. */ kvm->arch.pre_fault_allowed = type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; ret = kvm_page_track_init(kvm); if (ret) @@ -12731,6 +12837,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); } + once_init(&kvm->arch.nx_once); return 0; out_uninit_mmu: @@ -12740,36 +12847,6 @@ out: return ret; } -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - return kvm_mmu_post_init_vm(kvm); -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_unload_vcpu_mmus(struct kvm *kvm) -{ - unsigned long i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_unload_vcpu_mmu(vcpu); - } -} - -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_pit(kvm); -} - /** * __x86_set_memory_region: Setup KVM internal memory slot * @@ -12800,7 +12877,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; - /* Called with kvm->slots_lock held. */ + lockdep_assert_held(&kvm->slots_lock); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) return ERR_PTR_USR(-EINVAL); @@ -12833,7 +12911,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, m.guest_phys_addr = gpa; m.userspace_addr = hva; m.memory_size = size; - r = __kvm_set_memory_region(kvm, &m); + r = kvm_set_internal_memslot(kvm, &m); if (r < 0) return ERR_PTR_USR(r); } @@ -12847,7 +12925,19 @@ EXPORT_SYMBOL_GPL(__x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { + /* + * Stop all background workers and kthreads before destroying vCPUs, as + * iterating over vCPUs in a different task while vCPUs are being freed + * is unsafe, i.e. will lead to use-after-free. The PIT also needs to + * be stopped before IRQ routing is freed. 
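+	 * (vCPU destruction itself now happens from kvm_arch_destroy_vm(),
+	 * via kvm_destroy_vcpus(), after this hook has run.)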
+ */ + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); + + kvm_free_pit(kvm); + kvm_mmu_pre_destroy_vm(kvm); + static_call_cond(kvm_x86_vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -12866,18 +12956,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - kvm_unload_vcpu_mmus(kvm); - kvm_x86_call(vm_destroy)(kvm); + kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); + kvm_x86_call(vm_destroy)(kvm); } static void memslot_rmap_free(struct kvm_memory_slot *slot) @@ -12934,7 +13023,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The - * old arrays will be freed by __kvm_set_memory_region() if installing + * old arrays will be freed by kvm_set_memory_region() if installing * the new memslot is successful. */ memset(&slot->arch, 0, sizeof(slot->arch)); @@ -13027,6 +13116,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) return -EINVAL; + if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1)) + return -EINVAL; + return kvm_alloc_memslot_metadata(kvm, new); } @@ -13043,7 +13135,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -13115,7 +13207,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { @@ -13368,8 +13460,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (vcpu->arch.apf.send_user_only && - kvm_x86_call(get_cpl)(vcpu) == 0) + if (!vcpu->arch.apf.send_always && + (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; if (is_guest_mode(vcpu)) { @@ -13459,7 +13551,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, } vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) @@ -13504,8 +13596,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first * (or last) non-coherent device is (un)registered to so that new SPTEs * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. 
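+	 * (kvm_check_has_quirk() returns false in that case and the zap below
+	 * is skipped.)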
*/ - if (kvm_mmu_may_ignore_guest_pat()) + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } @@ -13529,25 +13623,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13557,9 +13653,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13567,12 +13663,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } @@ -13585,7 +13687,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); @@ -13629,12 +13732,12 @@ int kvm_spec_ctrl_test_value(u64 value) local_irq_save(flags); - if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; - else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) ret = 1; else - wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); local_irq_restore(flags); @@ -13973,6 +14076,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ec623d23d13d..832f0faf4779 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,6 +10,8 @@ #include "kvm_emulate.h" #include "cpuid.h" +#define KVM_MAX_MCE_BANKS 32 + struct kvm_caps { /* control of guest tsc rate supported? 
*/ bool has_tsc_control; @@ -32,6 +34,9 @@ struct kvm_caps { u64 supported_xcr0; u64 supported_xss; u64 supported_perf_cap; + + u64 supported_quirks; + u64 inapplicable_quirks; }; struct kvm_host_values { @@ -116,11 +121,36 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) kvm_x86_ops.nested_ops->leave_nested(vcpu); } +/* + * If IBRS is advertised to the vCPU, KVM must flush the indirect branch + * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in + * this case) to provide separate predictor modes. Bare metal isolates the host + * from the guest, but doesn't isolate different guests from one another (in + * this case L1 and L2). The exception is if bare metal supports same mode IBRS, + * which offers protection within the same mode, and hence protects L1 from L2. + */ +static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) +{ + if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE)) + return; + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS)) + indirect_branch_prediction_barrier(); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; } +static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) +{ + vcpu->arch.mp_state = mp_state; + if (mp_state == KVM_MP_STATE_RUNNABLE) + vcpu->arch.pv.pv_unhalted = false; +} + static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) { return vcpu->arch.exception.pending || @@ -362,6 +392,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); +int kvm_guest_time_update(struct kvm_vcpu *v); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, @@ -550,7 +581,6 @@ static inline void kvm_machine_check(void) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); @@ -577,6 +607,11 @@ enum kvm_msr_access { #define KVM_MSR_RET_UNSUPPORTED 2 #define KVM_MSR_RET_FILTERED 3 +static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return !(cr4 & vcpu->arch.cr4_guest_rsvd_bits); +} + #define __cr4_reserved_bits(__cpu_has, __c) \ ({ \ u64 __reserved_bits = CR4_RESERVED_BITS; \ @@ -612,4 +647,24 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in); +static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr) +{ + return kvm->arch.hypercall_exit_enabled & BIT(hc_nr); +} + +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, + int (*complete_hypercall)(struct kvm_vcpu *)); + +#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \ +({ \ + int __ret; \ + __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \ + \ + if (__ret > 0) \ + __ret = complete_hypercall(_vcpu); \ + __ret; \ +}) + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index a909b817b9c0..9b029bb29a16 100644 --- a/arch/x86/kvm/xen.c +++ 
b/arch/x86/kvm/xen.c @@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, + struct pvclock_vcpu_time_info *hv_clock, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) +{ + unsigned long flags; + int r; + + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); + + r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); + if (r) + return r; + + read_lock_irqsave(&gpc->lock, flags); + } + + memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); + read_unlock_irqrestore(&gpc->lock, flags); + + /* + * Sanity check TSC shift+multiplier to verify the guest's view of time + * is more or less consistent. + */ + if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift || + hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul) + return -EINVAL; + + return 0; +} + static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, bool linux_wa) { + struct kvm_vcpu_xen *xen = &vcpu->arch.xen; int64_t kernel_now, delta; uint64_t guest_now; + int r = -EOPNOTSUPP; /* * The guest provides the requested timeout in absolute nanoseconds @@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, * the absolute CLOCK_MONOTONIC time at which the timer should * fire. */ - if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && - static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + do { + struct pvclock_vcpu_time_info hv_clock; uint64_t host_tsc, guest_tsc; + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) || + !vcpu->kvm->arch.use_master_clock) + break; + + /* + * If both Xen PV clocks are active, arbitrarily try to use the + * compat clock first, but also try to use the non-compat clock + * if the compat clock is unusable. The two PV clocks hold the + * same information, but it's possible one (or both) is stale + * and/or currently unreachable. + */ + if (xen->vcpu_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (r && xen->vcpu_time_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0); + if (r) + break; + if (!IS_ENABLED(CONFIG_64BIT) || !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { /* @@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, /* Calculate the guest kvmclock as the guest would do it. */ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); - guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, - guest_tsc); - } else { + guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc); + } while (0); + + if (r) { /* * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. * @@ -1280,10 +1335,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) * Note, truncation is a non-issue as 'lm' is guaranteed to be * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. */ - hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 - : kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; + hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64 + : kvm->arch.xen.hvm_config.blob_addr_32; + u8 blob_size = lm ? 
kvm->arch.xen.hvm_config.blob_size_64 + : kvm->arch.xen.hvm_config.blob_size_32; u8 *page; int ret; @@ -1324,15 +1379,24 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; + /* + * Restrict the MSR to the range that is unofficially reserved for + * synthetic, virtualization-defined MSRs, e.g. to prevent confusing + * KVM by colliding with a real MSR that requires special handling. + */ + if (xhc->msr && + (xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX)) + return -EINVAL; + mutex_lock(&kvm->arch.xen.xen_lock); - if (xhc->msr && !kvm->arch.xen_hvm_config.msr) + if (xhc->msr && !kvm->arch.xen.hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); - else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) + else if (!xhc->msr && kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); - old_flags = kvm->arch.xen_hvm_config.flags; - memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); + old_flags = kvm->arch.xen.hvm_config.flags; + memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc)); mutex_unlock(&kvm->arch.xen.xen_lock); @@ -1413,7 +1477,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, int i; if (!lapic_in_kernel(vcpu) || - !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) + !(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; if (IS_ENABLED(CONFIG_64BIT) && !longmode) { @@ -1480,7 +1544,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); if (sched_poll.timeout) mod_timer(&vcpu->arch.xen.poll_timer, @@ -1489,9 +1553,9 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, kvm_vcpu_halt(vcpu); if (sched_poll.timeout) - del_timer(&vcpu->arch.xen.poll_timer); + timer_delete(&vcpu->arch.xen.poll_timer); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } vcpu->arch.xen.poll_evtchn = 0; @@ -1507,7 +1571,8 @@ out: static void cancel_evtchn_poll(struct timer_list *t) { - struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, + arch.xen.poll_timer); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -2225,8 +2290,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; + hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); @@ -2244,30 +2309,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); - del_timer_sync(&vcpu->arch.xen.poll_timer); -} - -void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *entry; - u32 function; - - if (!vcpu->arch.xen.cpuid.base) - return; - - function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); - if (function > vcpu->arch.xen.cpuid.limit) - return; - - entry = 
kvm_find_cpuid_entry_index(vcpu, function, 1); - if (entry) { - entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; - entry->edx = vcpu->arch.hv_clock.tsc_shift; - } - - entry = kvm_find_cpuid_entry_index(vcpu, function, 2); - if (entry) - entry->eax = vcpu->arch.hw_tsc_khz; + timer_delete_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) @@ -2291,6 +2333,6 @@ void kvm_xen_destroy_vm(struct kvm *kvm) } idr_destroy(&kvm->arch.xen.evtchn_ports); - if (kvm->arch.xen_hvm_config.msr) + if (kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index f5841d9000ae..59e6128a7bd3 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -9,6 +9,7 @@ #ifndef __ARCH_X86_KVM_XEN_H__ #define __ARCH_X86_KVM_XEN_H__ +#include <asm/xen/cpuid.h> #include <asm/xen/hypervisor.h> #ifdef CONFIG_KVM_XEN @@ -35,7 +36,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); -void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu); static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu) { @@ -50,16 +50,32 @@ static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu) kvm_xen_inject_vcpu_vector(vcpu); } +static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.cpuid.base && + function <= vcpu->arch.xen.cpuid.limit && + function == (vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3)); +} + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && - kvm->arch.xen_hvm_config.msr; + kvm->arch.xen.hvm_config.msr; +} + +static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr) +{ + if (!static_branch_unlikely(&kvm_xen_enabled.key)) + return false; + + return msr && msr == kvm->arch.xen.hvm_config.msr; } static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && - (kvm->arch.xen_hvm_config.flags & + (kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); } @@ -124,6 +140,11 @@ static inline bool kvm_xen_msr_enabled(struct kvm *kvm) return false; } +static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr) +{ + return false; +} + static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) { return false; @@ -157,8 +178,9 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) return false; } -static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function) { + return false; } #endif |
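
The kvm_set_mp_state() helper added to x86.h above funnels every mp_state write through one place so that a transition to KVM_MP_STATE_RUNNABLE also clears the pending PV-unhalt flag; the x86.c and xen.c hunks convert their open-coded assignments to it. Below is a minimal standalone sketch of that setter-with-invariant pattern, using simplified, hypothetical types (fake_vcpu, fake_set_mp_state) rather than the real KVM structures.

#include <stdbool.h>
#include <stdio.h>

/*
 * Standalone model of the kvm_set_mp_state() change in the hunk above: every
 * transition to RUNNABLE must also clear the paravirt "unhalt was requested"
 * flag, so the write is funneled through one helper instead of open-coded
 * assignments.  The enum and struct below are simplified stand-ins, not the
 * KVM definitions.
 */

enum fake_mp_state { FAKE_MP_RUNNABLE, FAKE_MP_HALTED };

struct fake_vcpu {
	enum fake_mp_state mp_state;
	bool pv_unhalted;	/* set when a PV "kick" asked to unhalt the vCPU */
};

static void fake_set_mp_state(struct fake_vcpu *vcpu, enum fake_mp_state state)
{
	vcpu->mp_state = state;
	/* A pending unhalt request is consumed by becoming runnable. */
	if (state == FAKE_MP_RUNNABLE)
		vcpu->pv_unhalted = false;
}

int main(void)
{
	struct fake_vcpu vcpu = { .mp_state = FAKE_MP_HALTED, .pv_unhalted = true };

	fake_set_mp_state(&vcpu, FAKE_MP_RUNNABLE);
	printf("runnable=%d pv_unhalted=%d\n",
	       vcpu.mp_state == FAKE_MP_RUNNABLE, vcpu.pv_unhalted);
	return 0;
}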
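
The x86.h hunk also reworks hypercall emulation around a caller-supplied completion callback: ____kvm_emulate_hypercall() returns a positive value only when the hypercall finished in-kernel, and the __kvm_emulate_hypercall() wrapper then runs the completion callback; otherwise the return value is propagated unchanged (e.g. for an exit to userspace). The sketch below models only that control flow; every name in it is hypothetical, and it is not the KVM implementation.

#include <stdio.h>

/*
 * Standalone model of the __kvm_emulate_hypercall() pattern from the hunk
 * above: the core helper returns <= 0 when the caller must bail out (e.g.
 * exit to userspace or report an error), and > 0 when the hypercall
 * completed in-kernel, in which case the wrapper invokes the caller-supplied
 * completion callback.  All names here are hypothetical.
 */

struct fake_vcpu {
	long ret;		/* value the "guest" sees in its return register */
	int needs_userspace;	/* models a hypercall that must exit to userspace */
};

static int fake_hypercall_core(struct fake_vcpu *vcpu)
{
	if (vcpu->needs_userspace)
		return 0;	/* caller must not run the completion callback */

	vcpu->ret = 42;		/* "handle" the hypercall in-kernel */
	return 1;		/* positive: completion callback should run */
}

static int complete_in_kernel(struct fake_vcpu *vcpu)
{
	printf("hypercall completed, guest sees %ld\n", vcpu->ret);
	return 1;		/* "continue running the guest" */
}

/* Wrapper mirroring the shape of __kvm_emulate_hypercall(). */
#define fake_emulate_hypercall(vcpu, complete)				\
({									\
	int __ret = fake_hypercall_core(vcpu);				\
	if (__ret > 0)							\
		__ret = complete(vcpu);					\
	__ret;								\
})

int main(void)
{
	struct fake_vcpu in_kernel = { .needs_userspace = 0 };
	struct fake_vcpu to_user   = { .needs_userspace = 1 };

	printf("in-kernel path -> %d\n",
	       fake_emulate_hypercall(&in_kernel, complete_in_kernel));
	printf("userspace path -> %d\n",
	       fake_emulate_hypercall(&to_user, complete_in_kernel));
	return 0;
}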
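
xen_get_guest_pvclock() in the xen.c hunk reads the guest's pvclock through a gfn_to_pfn_cache: the mapping may only be consumed under the cache's read lock, but refreshing it requires dropping that lock, so the reader loops check/unlock/refresh/relock until a check performed under the lock succeeds. The standalone sketch below models that retry shape with a pthreads rwlock and made-up cache helpers; it illustrates the locking pattern only and is not the kvm_gpc_* API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Standalone model of the check/refresh retry loop used by
 * xen_get_guest_pvclock() in the hunk above.  The cache layout and helper
 * names below are hypothetical.
 */

struct fake_gpc {
	pthread_rwlock_t lock;
	bool valid;			/* does the cached mapping cover the request? */
	unsigned char backing[64];	/* stand-in for the mapped guest page */
};

static bool fake_gpc_check(struct fake_gpc *gpc, size_t len)
{
	return gpc->valid && len <= sizeof(gpc->backing);
}

static int fake_gpc_refresh(struct fake_gpc *gpc, size_t len)
{
	if (len > sizeof(gpc->backing))
		return -1;	/* cannot satisfy the request at all */
	gpc->valid = true;	/* pretend the mapping was re-established */
	return 0;
}

static int fake_read_cached(struct fake_gpc *gpc, void *dst, size_t len)
{
	pthread_rwlock_rdlock(&gpc->lock);
	while (!fake_gpc_check(gpc, len)) {
		/* Must not refresh while holding the read lock: drop, refresh, retry. */
		pthread_rwlock_unlock(&gpc->lock);
		if (fake_gpc_refresh(gpc, len))
			return -1;
		pthread_rwlock_rdlock(&gpc->lock);
	}
	memcpy(dst, gpc->backing, len);
	pthread_rwlock_unlock(&gpc->lock);
	return 0;
}

int main(void)
{
	struct fake_gpc gpc = { .valid = false };
	unsigned char buf[16];

	pthread_rwlock_init(&gpc.lock, NULL);
	printf("read %s\n",
	       fake_read_cached(&gpc, buf, sizeof(buf)) ? "failed" : "succeeded");
	pthread_rwlock_destroy(&gpc.lock);
	return 0;
}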
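
Finally, the xen.h hunk drops kvm_xen_update_tsc_info() (which patched cached CPUID entries after TSC changes) in favour of kvm_xen_is_tsc_leaf(), a predicate evaluated per CPUID lookup: the Xen leaf base must be set, the queried function must lie within the advertised leaf range, and it must match the time/TSC leaf, modelled here as base + 3. A minimal sketch of that range-plus-offset check follows; the base and limit values are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Standalone model of the kvm_xen_is_tsc_leaf() check from the hunk above:
 * the Xen hypervisor CPUID leaves start at a discoverable base, leaf 0 at
 * that base reports the highest valid leaf ("limit"), and the time/TSC leaf
 * sits at an offset of 3 from the base.  The values used in main() are made
 * up for the example.
 */

struct xen_cpuid_range {
	uint32_t base;		/* 0 if the guest has no Xen CPUID leaves */
	uint32_t limit;		/* highest valid leaf, as reported by leaf 'base' */
};

static bool is_xen_tsc_leaf(const struct xen_cpuid_range *r, uint32_t function)
{
	return r->base &&
	       function <= r->limit &&
	       function == r->base + 3;
}

int main(void)
{
	struct xen_cpuid_range r = { .base = 0x40000000, .limit = 0x40000005 };

	printf("0x40000003 -> %d\n", is_xen_tsc_leaf(&r, 0x40000003)); /* 1 */
	printf("0x40000002 -> %d\n", is_xen_tsc_leaf(&r, 0x40000002)); /* 0 */
	return 0;
}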