From aa8d1f48d353b0469bff357183ee9df137d15ef0 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:10:43 +0800 Subject: KVM: x86/mmu: Introduce a quirk to control memslot zap behavior Introduce the quirk KVM_X86_QUIRK_SLOT_ZAP_ALL to allow users to select KVM's behavior when a memslot is moved or deleted for KVM_X86_DEFAULT_VM VMs. Make sure KVM behaves as if the quirk is always disabled for non-KVM_X86_DEFAULT_VM VMs. The KVM_X86_QUIRK_SLOT_ZAP_ALL quirk offers two behavior options: - when enabled: Invalidate/zap all SPTEs ("zap-all"), - when disabled: Precisely zap only the leaf SPTEs within the range of the moving/deleting memory slot ("zap-slot-leafs-only"). "zap-all" is today's KVM behavior, a workaround for a bug [1] where changing the zapping behavior of memslot move/deletion would cause VM instability for VMs with an Nvidia GPU assigned, while "zap-slot-leafs-only" allows for more precise zapping of SPTEs within the memory slot range, improving performance in certain scenarios [2] and meeting the functional requirements for TDX. Previous attempts to select "zap-slot-leafs-only" include a per-VM capability approach [3] (which was not preferred because the root cause of the bug remained unidentified) and a per-memslot flag approach [4]. Sean and Paolo finally recommended implementing this quirk and explained that it is the least bad option [5]. By default, the quirk is enabled on KVM_X86_DEFAULT_VM VMs to use "zap-all". Users have the option to disable the quirk to select "zap-slot-leafs-only" for specific KVM_X86_DEFAULT_VM VMs that are unaffected by this bug. For non-KVM_X86_DEFAULT_VM VMs, the "zap-slot-leafs-only" behavior is always selected without the user's opt-in, regardless of whether the user opts for "zap-all". This is because it is assumed until proven otherwise that non-KVM_X86_DEFAULT_VM VMs will not be exposed to the bug [1], and most importantly, because TDX must have "zap-slot-leafs-only" always selected. In TDX's case a memslot's GPA range can be a mixture of "private" and "shared" memory. Shared memory is roughly analogous to how EPT is handled for normal VMs, but private GPAs need lots of special treatment: 1) "zap-all" would require zapping the private root page or non-leaf entries, or at least leaf entries beyond the scope of the memslot being deleted. However, TDX demands that the root page of the private page table remain unchanged, that leaf entries be zapped before non-leaf entries, and that any dropped private guest pages be re-accepted by the guest. 2) If "zap-all" zaps only shared page tables, it would result in private pages still being mapped when the memslot is gone. This may affect even other processes if the gmem fd is later hole punched, causing pages to be freed on the host while still mapped in the TD, because there is no pgoff-to-gfn information with which to zap the private page table after the memslot is gone. So, simply go "zap-slot-leafs-only", as if the quirk were always disabled, for non-KVM_X86_DEFAULT_VM VMs to avoid manual opt-in for every VM type [6] or complicating the quirk-disabling interface (the current quirk-disabling interface is limited: there is no way to query quirks or to force them to be disabled). Add a new function kvm_mmu_zap_memslot_leafs() to implement "zap-slot-leafs-only". This function does not call kvm_unmap_gfn_range(), bypassing the special handling of APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, because: 1) The APIC_ACCESS_PAGE_PRIVATE_MEMSLOT cannot be created by users, nor can it be moved. It is only deleted by KVM when APICv is permanently inhibited. 
2) kvm_vcpu_reload_apic_access_page() effectively does nothing when APIC_ACCESS_PAGE_PRIVATE_MEMSLOT is deleted. 3) Avoid making all cpus request of KVM_REQ_APIC_PAGE_RELOAD can save on costly IPIs. Suggested-by: Kai Huang Suggested-by: Sean Christopherson Suggested-by: Paolo Bonzini Link: https://patchwork.kernel.org/project/kvm/patch/20190205210137.1377-11-sean.j.christopherson@intel.com [1] Link: https://patchwork.kernel.org/project/kvm/patch/20190205210137.1377-11-sean.j.christopherson@intel.com/#25054908 [2] Link: https://lore.kernel.org/kvm/20200713190649.GE29725@linux.intel.com/T/#mabc0119583dacf621025e9d873c85f4fbaa66d5c [3] Link: https://lore.kernel.org/all/20240515005952.3410568-3-rick.p.edgecombe@intel.com [4] Link: https://lore.kernel.org/all/7df9032d-83e4-46a1-ab29-6c7973a2ab0b@redhat.com [5] Link: https://lore.kernel.org/all/ZnGa550k46ow2N3L@google.com [6] Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Yan Zhao Message-ID: <20240703021043.13881-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78..e4fc362ba3da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2345,7 +2345,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ + KVM_X86_QUIRK_SLOT_ZAP_ALL) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index bf57a824f722..a8debbf2f702 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) +#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 -- cgit v1.2.3 From 1c450ffef589383f743d798666703687fc2e582b Mon Sep 17 00:00:00 2001 From: Tao Su Date: Mon, 19 Aug 2024 14:23:27 +0800 Subject: KVM: x86: Advertise AVX10.1 CPUID to userspace Advertise AVX10.1 related CPUIDs, i.e. report AVX10 support bit via CPUID.(EAX=07H, ECX=01H):EDX[bit 19] and new CPUID leaf 0x24H so that guest OS and applications can query the AVX10.1 CPUIDs directly. Intel AVX10 represents the first major new vector ISA since the introduction of Intel AVX512, which will establish a common, converged vector instruction set across all Intel architectures[1]. AVX10.1 is an early version of AVX10, that enumerates the Intel AVX512 instruction set at 128, 256, and 512 bits which is enabled on Granite Rapids. I.e., AVX10.1 is only a new CPUID enumeration with no new functionality. New features, e.g. Embedded Rounding and Suppress All Exceptions (SAE) will be introduced in AVX10.2. Advertising AVX10.1 is safe because there is nothing to enable for AVX10.1, i.e. it's purely a new way to enumerate support, thus there will never be anything for the kernel to enable. Note just the CPUID checking is changed when using AVX512 related instructions, e.g. 
if using one AVX512 instruction needs to check (AVX512 AND AVX512DQ), it can check ((AVX512 AND AVX512DQ) OR AVX10.1) after checking XCR0[7:5]. The versions of AVX10 are expected to be inclusive, e.g. version N+1 is a superset of version N. Per the spec, the version can never be 0, just advertise AVX10.1 if it's supported in hardware. Moreover, advertising AVX10_{128,256,512} needs to land in the same commit as advertising basic AVX10.1 support, otherwise KVM would advertise an impossible CPU model. E.g. a CPU with AVX512 but not AVX10.1/512 is impossible per the SDM. As more and more AVX related CPUIDs are added (it would have resulted in around 40-50 CPUID flags when developing AVX10), the versioning approach is introduced. But incrementing version numbers are bad for virtualization. E.g. if AVX10.2 has a feature that shouldn't be enumerated to guests for whatever reason, then KVM can't enumerate any "later" features either, because the only way to hide the problematic AVX10.2 feature is to set the version to AVX10.1 or lower[2]. But most AVX features are just passed through and don't have virtualization controls, so AVX10 should not be problematic in practice, so long as Intel honors their promise that future versions will be supersets of past versions. [1] https://cdrdv2.intel.com/v1/dl/getContent/784267 [2] https://lore.kernel.org/all/Zkz5Ak0PQlAN8DxK@google.com/ Suggested-by: Sean Christopherson Signed-off-by: Tao Su Link: https://lore.kernel.org/r/20240819062327.3269720-1-tao1.su@linux.intel.com [sean: minor changelog tweaks] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpuid.h | 1 + arch/x86/kvm/cpuid.c | 30 ++++++++++++++++++++++++++++-- arch/x86/kvm/reverse_cpuid.h | 8 ++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 6b122a31da06..aa21c105eef1 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) case 0x1d: case 0x1e: case 0x1f: + case 0x24: case 0x8000001d: return true; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2617be544480..41786b834b16 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -705,7 +705,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | - F(AMX_COMPLEX) + F(AMX_COMPLEX) | F(AVX10) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, @@ -721,6 +721,10 @@ void kvm_set_cpu_caps(void) SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); + kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX, + F(AVX10_128) | F(AVX10_256) | F(AVX10_512) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -949,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ - entry->eax = min(entry->eax, 0x1fU); + entry->eax = min(entry->eax, 0x24U); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); @@ -1174,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } break; + case 0x24: { + u8 avx10_version; + + if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * The AVX10 version is encoded in EBX[7:0]. 
Note, the version + * is guaranteed to be >=1 if AVX10 is supported. Note #2, the + * version needs to be captured before overriding EBX features! + */ + avx10_version = min_t(u8, entry->ebx & 0xff, 1); + cpuid_entry_override(entry, CPUID_24_0_EBX); + entry->ebx |= avx10_version; + + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 2f4e155080ba..0d17d6b70639 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs { CPUID_8000_0007_EDX, CPUID_8000_0022_EAX, CPUID_7_2_EDX, + CPUID_24_0_EBX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) @@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ +#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) +#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) +#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, }; /* -- cgit v1.2.3 From e7e80b66fb242a63cdfc3d3534cff62a5e293185 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:09 -0700 Subject: x86/cpu: KVM: Add common defines for architectural memory types (PAT, MTRRs, etc.) Add defines for the architectural memory types that can be shoved into various MSRs and registers, e.g. MTRRs, PAT, VMX capabilities MSRs, EPTPs, etc. While most MSRs/registers support only a subset of all memory types, the values themselves are architectural and identical across all users. Leave the goofy MTRR_TYPE_* definitions as-is since they are in a uapi header, but add compile-time assertions to connect the dots (and sanity check that the msr-index.h values didn't get fat-fingered). Keep the VMX_EPTP_MT_* defines so that it's slightly more obvious that the EPTP holds a single memory type in 3 of its 64 bits; those bits just happen to be 2:0, i.e. don't need to be shifted. Opportunistically use X86_MEMTYPE_WB instead of an open coded '6' in setup_vmcs_config(). No functional change intended. 
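A minimal sketch (editor illustration, not part of the patch) of what "the values themselves are architectural and identical across all users" buys in practice; root_hpa is a hypothetical local, while the defines and the MTRR assertion come from the hunks above:

	/* Sketch: the same WB encoding feeds an EPTP directly... */
	u64 eptp = root_hpa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4;

	/* ...and lines up with the MTRR uapi value at compile time. */
	static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
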
Reviewed-by: Thomas Gleixner Acked-by: Kai Huang Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20240605231918.2915961-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 15 ++++++++++++++- arch/x86/include/asm/vmx.h | 5 +++-- arch/x86/kernel/cpu/mtrr/mtrr.c | 6 ++++++ arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/mm/pat/memtype.c | 33 ++++++++++++--------------------- 6 files changed, 37 insertions(+), 26 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 82c6a4d350e0..bb43a07616e6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,6 +36,20 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) +/* + * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc. + * Most MSRs support/allow only a subset of memory types, but the values + * themselves are common across all relevant MSRs. + */ +#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */ +#define X86_MEMTYPE_WC 1ull /* Write Combining */ +/* RESERVED 2 */ +/* RESERVED 3 */ +#define X86_MEMTYPE_WT 4ull /* Write Through */ +#define X86_MEMTYPE_WP 5ull /* Write Protected */ +#define X86_MEMTYPE_WB 6ull /* Write Back */ +#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */ + /* FRED MSRs */ #define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ #define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ @@ -1163,7 +1177,6 @@ #define VMX_BASIC_64 0x0001000000000000LLU #define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU -#define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU /* Resctrl MSRs: */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d77a31039f24..e531d8d80a11 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -508,9 +508,10 @@ enum vmcs_field { #define VMX_EPTP_PWL_4 0x18ull #define VMX_EPTP_PWL_5 0x20ull #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) +/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */ #define VMX_EPTP_MT_MASK 0x7ull -#define VMX_EPTP_MT_WB 0x6ull -#define VMX_EPTP_MT_UC 0x0ull +#define VMX_EPTP_MT_WB X86_MEMTYPE_WB +#define VMX_EPTP_MT_UC X86_MEMTYPE_UC #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 2a2fc14955cd..989d368be04f 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -55,6 +55,12 @@ #include "mtrr.h" +static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE); +static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB); +static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH); +static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT); +static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK); + /* arch_phys_wc_add returns an MTRR register index plus this offset. 
*/ #define MTRR_TO_PHYS_WC_OFFSET 1000 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2392a7ef254d..504fe5ffd47b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7070,7 +7070,7 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + (X86_MEMTYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476..a4d077db04cf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2747,7 +2747,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, #endif /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) + if (((vmx_msr_high >> 18) & 15) != X86_MEMTYPE_WB) return -EIO; rdmsrl(MSR_IA32_VMX_MISC, misc_msr); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index bdc2a240c2aa..15b888ebaf17 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -176,15 +176,6 @@ static inline void set_page_memtype(struct page *pg, } #endif -enum { - PAT_UC = 0, /* uncached */ - PAT_WC = 1, /* Write combining */ - PAT_WT = 4, /* Write Through */ - PAT_WP = 5, /* Write Protected */ - PAT_WB = 6, /* Write Back (default) */ - PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ -}; - #define CM(c) (_PAGE_CACHE_MODE_ ## c) static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, @@ -194,13 +185,13 @@ static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, char *cache_mode; switch (pat_val) { - case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; - case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; - case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; - case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; - case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; - case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; - default: cache = CM(WB); cache_mode = "WB "; break; + case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break; + case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break; + case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break; + case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break; + case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break; + case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; + default: cache = CM(WB); cache_mode = "WB "; break; } memcpy(msg, cache_mode, 4); @@ -257,11 +248,11 @@ void pat_cpu_init(void) void __init pat_bp_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; -#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ - (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ - ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ - ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ - ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) +#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) if (!IS_ENABLED(CONFIG_X86_PAT)) -- cgit v1.2.3 From beb2e446046f8dd96bbeed3267b5f26bf1926ef9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:10 -0700 Subject: x86/cpu: KVM: Move macro to 
encode PAT value to common header Move pat/memtype.c's PAT() macro to msr-index.h as PAT_VALUE(), and use it in KVM to define the default (Power-On / RESET) PAT value instead of open coding an inscrutable magic number. No functional change intended. Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240605231918.2915961-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 6 ++++++ arch/x86/kvm/x86.h | 3 ++- arch/x86/mm/pat/memtype.c | 13 +++---------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index bb43a07616e6..f6eb3460e169 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -377,6 +377,12 @@ #define MSR_IA32_CR_PAT 0x00000277 +#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \ + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) + #define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHTOIP 0x000001dc diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 50596f6f8320..5585c2ef147a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -103,7 +103,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val, return max(val, min); } -#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +#define MSR_IA32_CR_PAT_DEFAULT \ + PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 15b888ebaf17..6c4e29457c10 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -248,12 +248,6 @@ void pat_cpu_init(void) void __init pat_bp_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; -#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ - ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ - (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ - (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ - (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) - if (!IS_ENABLED(CONFIG_X86_PAT)) pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); @@ -284,7 +278,7 @@ void __init pat_bp_init(void) * NOTE: When WC or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ - pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); + pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); } /* @@ -319,7 +313,7 @@ void __init pat_bp_init(void) * NOTE: When WT or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); } else { /* * Full PAT support. We put WT in slot 7 to improve @@ -347,13 +341,12 @@ void __init pat_bp_init(void) * The reserved slots are unused, but mapped to their * corresponding types in the presence of PAT errata. 
*/ - pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); + pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); } memory_caching_control |= CACHE_PAT; init_cache_modes(pat_msr_val); -#undef PAT } static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ -- cgit v1.2.3 From d7bfc9ffd58037ff86f9fd0c3cef77cccb555da3 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:12 -0700 Subject: KVM: VMX: Move MSR_IA32_VMX_BASIC bit defines to asm/vmx.h Move the bit defines for MSR_IA32_VMX_BASIC from msr-index.h to vmx.h so that they are colocated with other VMX MSR bit defines, and with the helpers that extract specific information from an MSR_IA32_VMX_BASIC value. Opportunistically use BIT_ULL() instead of open coding hex values. Opportunistically rename VMX_BASIC_64 to VMX_BASIC_32BIT_PHYS_ADDR_ONLY, as "VMX_BASIC_64" is widly misleading. The flag enumerates that addresses are limited to 32 bits, not that 64-bit addresses are allowed. Last but not least, opportunistically #define DUAL_MONITOR_TREATMENT so that all known single-bit feature flags are defined (this will allow replacing open-coded literals in the future). Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Zhao Liu Reviewed-by: Kai Huang Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240605231918.2915961-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 8 -------- arch/x86/include/asm/vmx.h | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f6eb3460e169..ad0b579d1c01 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1177,14 +1177,6 @@ #define MSR_IA32_VMX_VMFUNC 0x00000491 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 -/* VMX_BASIC bits and bitmasks */ -#define VMX_BASIC_VMCS_SIZE_SHIFT 32 -#define VMX_BASIC_TRUE_CTLS (1ULL << 55) -#define VMX_BASIC_64 0x0001000000000000LLU -#define VMX_BASIC_MEM_TYPE_SHIFT 50 -#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU -#define VMX_BASIC_INOUT 0x0040000000000000LLU - /* Resctrl MSRs: */ /* - Intel: */ #define MSR_IA32_L3_QOS_CFG 0xc81 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index e531d8d80a11..81b986e501a9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -135,6 +135,13 @@ #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) #define VMFUNC_EPTP_ENTRIES 512 +#define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) +#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) +#define VMX_BASIC_MEM_TYPE_SHIFT 50 +#define VMX_BASIC_INOUT BIT_ULL(54) +#define VMX_BASIC_TRUE_CTLS BIT_ULL(55) + static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { return vmx_basic & GENMASK_ULL(30, 0); -- cgit v1.2.3 From 9df398ff7d2ab4fa2ecf6131044431dc94c5bdf6 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:13 -0700 Subject: KVM: VMX: Track CPU's MSR_IA32_VMX_BASIC as a single 64-bit value Track the "basic" capabilities VMX MSR as a single u64 in vmcs_config instead of splitting it across three fields, that obviously don't combine into a single 64-bit value, so that KVM can use the macros that define MSR bits using their absolute position. Replace all open coded shifts and masks, many of which are relative to the "high" half, with the appropriate macro. 
Opportunistically use VMX_BASIC_32BIT_PHYS_ADDR_ONLY instead of an open coded equivalent, and clean up the related comment to not reference a specific SDM section (to the surprise of no one, the comment is stale). No functional change intended (though obviously the code generation will be quite different). Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20240605231918.2915961-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 5 +++++ arch/x86/kvm/vmx/capabilities.h | 6 ++---- arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++++-------------- 3 files changed, 21 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 81b986e501a9..90963b14afaa 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -152,6 +152,11 @@ static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; } +static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; +} + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 41a4533f9989..86ce8bb96bed 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -54,9 +54,7 @@ struct nested_vmx_msrs { }; struct vmcs_config { - int size; - u32 basic_cap; - u32 revision_id; + u64 basic; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; @@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); + return vmcs_config.basic & VMX_BASIC_INOUT; } static inline bool cpu_has_virtual_nmis(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a4d077db04cf..5cbd86c4386b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2605,13 +2605,13 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { - u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + u64 basic_msr; u64 misc_msr; int i; @@ -2734,29 +2734,29 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, _vmexit_control &= ~x_ctrl; } - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) return -EIO; #ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) + /* + * KVM expects to be able to shove all legal physical addresses into + * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always + * 0 for processors that support Intel 64 architecture". + */ + if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EIO; #endif /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ - if (((vmx_msr_high >> 18) & 15) != X86_MEMTYPE_WB) + if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; rdmsrl(MSR_IA32_VMX_MISC, misc_msr); - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - - vmcs_conf->revision_id = vmx_msr_low; - + vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; @@ -2903,13 +2903,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) if (!pages) return NULL; vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); + memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); /* KVM supports Enlightened VMCS v1 only */ if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); if (shadow) vmcs->hdr.shadow_vmcs = 1; @@ -3002,7 +3002,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (kvm_is_using_evmcs()) - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); per_cpu(vmxarea, cpu) = vmcs; } -- cgit v1.2.3 From 92e648042c237c3bba223ab7f7e1a76f41ab3ef7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:15 -0700 Subject: KVM: nVMX: Add a helper to encode VMCS info in MSR_IA32_VMX_BASIC Add a helper to encode the VMCS revision, size, and supported memory types in MSR_IA32_VMX_BASIC, i.e. when synthesizing KVM's supported BASIC MSR value, and delete the now unused VMCS size and memtype shift macros. For a variety of reasons, KVM has shifted (pun intended) to using helpers to *get* information from the VMX MSRs, as opposed to defined MASK and SHIFT macros for direct use. Provide a similar helper for the nested VMX code, which needs to *set* information, so that KVM isn't left with a mix of SHIFT macros and dedicated helpers. 
Reported-by: Xiaoyao Li Reviewed-by: Xiaoyao Li Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20240605231918.2915961-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 7 +++++-- arch/x86/kvm/vmx/nested.c | 8 +++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 90963b14afaa..65aaf0577265 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -135,10 +135,8 @@ #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) #define VMFUNC_EPTP_ENTRIES 512 -#define VMX_BASIC_VMCS_SIZE_SHIFT 32 #define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) #define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) -#define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_INOUT BIT_ULL(54) #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) @@ -157,6 +155,11 @@ static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; } +static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) +{ + return revision | ((u64)size << 32) | ((u64)memtype << 50); +} + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index eba5e94a3c7c..335f1a650497 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7077,12 +7077,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (X86_MEMTYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, + X86_MEMTYPE_WB); + msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; } -- cgit v1.2.3 From dc1e67f70f6d4e336da06cb61bf6669d6fa39869 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Jun 2024 16:19:16 -0700 Subject: KVM VMX: Move MSR_IA32_VMX_MISC bit defines to asm/vmx.h Move the handful of MSR_IA32_VMX_MISC bit defines that are currently in msr-indx.h to vmx.h so that all of the VMX_MISC defines and wrappers can be found in a single location. Opportunistically use BIT_ULL() instead of open coding hex values, add defines for feature bits that are architecturally defined, and move the defines down in the file so that they are colocated with the helpers for getting fields from VMX_MISC. No functional change intended. 
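A minimal sketch (not from this patch) of how the relocated defines read at a call site, assuming a raw read of the MSR rather than the cached vmcs_config value:

	u64 misc;

	rdmsrl(MSR_IA32_VMX_MISC, misc);

	/* Bit 8 advertises the wait-for-SIPI activity state. */
	if (misc & VMX_MISC_ACTIVITY_WAIT_SIPI)
		pr_info("wait-for-SIPI activity state supported\n");
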
Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Zhao Liu Reviewed-by: Kai Huang Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240605231918.2915961-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr-index.h | 5 ----- arch/x86/include/asm/vmx.h | 19 ++++++++++++------- arch/x86/kvm/vmx/capabilities.h | 4 ++-- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/nested.h | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ad0b579d1c01..72c2c0ecb62c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1194,11 +1194,6 @@ #define MSR_IA32_SMBA_BW_BASE 0xc0000280 #define MSR_IA32_EVT_CFG_BASE 0xc0000400 -/* MSR_IA32_VMX_MISC bits */ -#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) -#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) -#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F - /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 65aaf0577265..400819ccb42c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -122,13 +122,6 @@ #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f -#define VMX_MISC_SAVE_EFER_LMA 0x00000020 -#define VMX_MISC_ACTIVITY_HLT 0x00000040 -#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 -#define VMX_MISC_ZERO_LEN_INS 0x40000000 -#define VMX_MISC_MSR_LIST_MULTIPLIER 512 - /* VMFUNC functions */ #define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28) @@ -160,6 +153,18 @@ static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) return revision | ((u64)size << 32) | ((u64)memtype << 50); } +#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK GENMASK_ULL(4, 0) +#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) +#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) +#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) +#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8) +#define VMX_MISC_INTEL_PT BIT_ULL(14) +#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15) +#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28) +#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29) +#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30) +#define VMX_MISC_MSR_LIST_MULTIPLIER 512 + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 86ce8bb96bed..cb6588238f46 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -223,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void) static inline bool cpu_has_vmx_shadow_vmcs(void) { /* check if the cpu supports writing r/o exit information fields */ - if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) return false; return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -365,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void) static inline bool cpu_has_vmx_intel_pt(void) { - return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && + return (vmcs_config.misc & VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } diff --git a/arch/x86/kvm/vmx/nested.c 
b/arch/x86/kvm/vmx/nested.c index 335f1a650497..72db3718d8b2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7062,7 +7062,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, { msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index cce4e2aa30fb..0782fe599757 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -109,7 +109,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; } static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 8f56b14e9fa0f91daf4d3be05988abf336255dc6 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 16:19:17 -0700 Subject: KVM: VMX: Open code VMX preemption timer rate mask in its accessor Use vmx_misc_preemption_timer_rate() to get the rate in hardware_setup(), and open code the rate's bitmask in vmx_misc_preemption_timer_rate() so that the function looks like all the helpers that grab values from VMX_BASIC and VMX_MISC MSR values. No functional change intended. Cc: Shan Kang Cc: Kai Huang Signed-off-by: Xin Li [sean: split to separate patch, write changelog] Reviewed-by: Kai Huang Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240605231918.2915961-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 3 +-- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 400819ccb42c..f7fd4369b821 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -153,7 +153,6 @@ static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) return revision | ((u64)size << 32) | ((u64)memtype << 50); } -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK GENMASK_ULL(4, 0) #define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) #define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) #define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) @@ -167,7 +166,7 @@ static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { - return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + return vmx_misc & GENMASK_ULL(4, 0); } static inline int vmx_misc_cr3_count(u64 vmx_misc) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5cbd86c4386b..39a26ecf87ea 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8517,7 +8517,7 @@ __init int vmx_hardware_setup(void) u64 use_timer_freq = 5000ULL * 1000 * 1000; cpu_preemption_timer_multi = - vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + vmx_misc_preemption_timer_rate(vmcs_config.misc); if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; -- cgit v1.2.3 From 653ea4489e6989f14a87abf8653f77c089097326 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Jul 2024 16:59:22 -0700 Subject: KVM: nVMX: Honor userspace MSR filter lists for nested VM-Enter/VM-Exit Synthesize a consistency check VM-Exit (VM-Enter) or VM-Abort (VM-Exit) if L1 attempts to 
load/store an MSR via the VMCS MSR lists that userspace has disallowed access to via an MSR filter. Intel already disallows including a handful of "special" MSRs in the VMCS lists, so denying access isn't completely without precedent. More importantly, the behavior is well-defined _and_ can be communicated the end user, e.g. to the customer that owns a VM running as L1 on top of KVM. On the other hand, ignoring userspace MSR filters is all but guaranteed to result in unexpected behavior as the access will hit KVM's internal state, which is likely not up-to-date. Unlike KVM-internal accesses, instruction emulation, and dedicated VMCS fields, the MSRs in the VMCS load/store lists are 100% guest controlled, thus making it all but impossible to reason about the correctness of ignoring the MSR filter. And if userspace *really* wants to deny access to MSRs via the aforementioned scenarios, userspace can hide the associated feature from the guest, e.g. by disabling the PMU to prevent accessing PERF_GLOBAL_CTRL via its VMCS field. But for the MSR lists, KVM is blindly processing MSRs; the MSR filters are the _only_ way for userspace to deny access. This partially reverts commit ac8d6cad3c7b ("KVM: x86: Only do MSR filtering when access MSR by rdmsr/wrmsr"). Cc: Hou Wenlong Cc: Jim Mattson Link: https://lore.kernel.org/r/20240722235922.3351122-1-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++---- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/x86.c | 6 ++++-- 4 files changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b3be87489108..a4b7dc4a9dda 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4214,7 +4214,9 @@ whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace on denied accesses, i.e. userspace effectively intercepts the MSR access. If KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest -on denied accesses. +on denied accesses. Note, if an MSR access is denied during emulation of MSR +load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER. +See the below warning for full details. If an MSR access is allowed by userspace, KVM will emulate and/or virtualize the access in accordance with the vCPU model. Note, KVM may still ultimately @@ -4229,9 +4231,22 @@ filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes an error. .. warning:: - MSR accesses as part of nested VM-Enter/VM-Exit are not filtered. - This includes both writes to individual VMCS fields and reads/writes - through the MSR lists pointed to by the VMCS. + MSR accesses that are side effects of instruction execution (emulated or + native) are not filtered as hardware does not honor MSR bitmaps outside of + RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions + to avoid pointless divergence from hardware. E.g. RDPID reads MSR_TSC_AUX, + SYSENTER reads the SYSENTER MSRs, etc. + + MSRs that are loaded/stored via dedicated VMCS fields are not filtered as + part of VM-Enter/VM-Exit emulation. + + MSRs that are loaded/store via VMX's load/store lists _are_ filtered as part + of VM-Enter/VM-Exit emulation. 
If an MSR access is denied on VM-Enter, KVM + synthesizes a consistency check VM-Exit(EXIT_REASON_MSR_LOAD_FAIL). If an + MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort. In short, KVM + extends Intel's architectural list of MSRs that cannot be loaded/saved via + the VM-Enter/VM-Exit MSR list. It is platform owner's responsibility to + to communicate any such restrictions to their end users. x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that cover any x2APIC MSRs). diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78..4a93ac1b9be9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2060,6 +2060,8 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2392a7ef254d..674f7089cc44 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -981,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr(vcpu, e.index, e.value)) { + if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -1017,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr(vcpu, msr_index, data)) { + if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -1112,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() - * instead of reading the value from the vmcs02 VMExit - * MSR-store area. + * nested_vmx_get_vmexit_msr_value() by reading KVM's + * internal MSR state instead of reading the value from + * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. 
Can't add msr %x\n", @@ -4806,7 +4806,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr(vcpu, h.index, h.value)) { + if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e406987..34b52b49f5e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1940,19 +1940,21 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, return ret; } -static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } +EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); -static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } +EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { -- cgit v1.2.3 From 74c6c98a598a1fa650f9f8dfb095d66e987ed9cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:29 -0700 Subject: KVM: x86: Refactor kvm_x86_ops.get_msr_feature() to avoid kvm_msr_entry Refactor get_msr_feature() to take the index and data pointer as distinct parameters in anticipation of eliminating "struct kvm_msr_entry" usage further up the primary callchain. No functional change intended. Link: https://lore.kernel.org/r/20240802181935.292540-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 16 +++++++--------- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/vmx/x86_ops.h | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 13 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78..fe61cff1f49d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1806,7 +1806,7 @@ struct kvm_x86_ops { int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(struct kvm_msr_entry *entry); + int (*get_msr_feature)(u32 msr, u64 *data); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a265f6e621fc..314dd4aacfe9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2825,14 +2825,14 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(struct kvm_msr_entry *msr) +static int svm_get_msr_feature(u32 msr, u64 *data) { - msr->data = 0; + *data = 0; - switch (msr->index) { + switch (msr) { case MSR_AMD64_DE_CFG: if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: return KVM_MSR_RET_UNSUPPORTED; @@ -3179,14 +3179,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { - struct kvm_msr_entry 
msr_entry; + u64 supported_de_cfg; - msr_entry.index = msr->index; - if (svm_get_msr_feature(&msr_entry)) + if (svm_get_msr_feature(ecx, &supported_de_cfg)) return 1; - /* Check the supported bits */ - if (data & ~msr_entry.data) + if (data & ~supported_de_cfg) return 1; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e5b253e4d421..3d24eb4aeca2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1998,13 +1998,13 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -int vmx_get_msr_feature(struct kvm_msr_entry *msr) +int vmx_get_msr_feature(u32 msr, u64 *data) { - switch (msr->index) { + switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: return KVM_MSR_RET_UNSUPPORTED; } diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3221cd1d01..9a0304eb847b 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -56,7 +56,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -int vmx_get_msr_feature(struct kvm_msr_entry *msr); +int vmx_get_msr_feature(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea2f3ff93957..29d1205f62d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1672,7 +1672,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return kvm_x86_call(get_msr_feature)(msr); + return kvm_x86_call(get_msr_feature)(msr->index, &msr->data); } return 0; } -- cgit v1.2.3 From b848f24bd74a699745c1145e8cb707884d80694e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 11:19:30 -0700 Subject: KVM: x86: Rename get_msr_feature() APIs to get_feature_msr() Rename all APIs related to feature MSRs from get_msr_feature() to get_feature_msr(). The APIs get "feature MSRs", not "MSR features". And unlike kvm_{g,s}et_msr_common(), the "feature" adjective doesn't describe the helper itself. No functional change intended. 
Link: https://lore.kernel.org/r/20240802181935.292540-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/vmx/main.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/x86_ops.h | 2 +- arch/x86/kvm/x86.c | 12 ++++++------ 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 68ad4f923664..9afbf8bcb521 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) -KVM_X86_OP(get_msr_feature) +KVM_X86_OP(get_feature_msr) KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fe61cff1f49d..95396e4cb3da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1806,7 +1806,7 @@ struct kvm_x86_ops { int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(u32 msr, u64 *data); + int (*get_feature_msr)(u32 msr, u64 *data); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 314dd4aacfe9..d8cfe8f23327 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2825,7 +2825,7 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(u32 msr, u64 *data) +static int svm_get_feature_msr(u32 msr, u64 *data) { *data = 0; @@ -3181,7 +3181,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_AMD64_DE_CFG: { u64 supported_de_cfg; - if (svm_get_msr_feature(ecx, &supported_de_cfg)) + if (svm_get_feature_msr(ecx, &supported_de_cfg)) return 1; if (data & ~supported_de_cfg) @@ -5002,7 +5002,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, - .get_msr_feature = svm_get_msr_feature, + .get_feature_msr = svm_get_feature_msr, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0bf35ebe8a1b..4f6023a0deb3 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -41,7 +41,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_put = vmx_vcpu_put, .update_exception_bitmap = vmx_update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, + .get_feature_msr = vmx_get_feature_msr, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3d24eb4aeca2..cf85f8d50ccb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1998,7 +1998,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -int vmx_get_msr_feature(u32 msr, u64 *data) +int vmx_get_feature_msr(u32 msr, u64 *data) { switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 9a0304eb847b..eeafd121fb08 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -56,7 +56,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -int vmx_get_msr_feature(u32 msr, u64 *data); +int vmx_get_feature_msr(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29d1205f62d3..3efe086bea4c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1659,7 +1659,7 @@ static u64 kvm_get_arch_capabilities(void) return data; } -static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +static int kvm_get_feature_msr(struct kvm_msr_entry *msr) { switch (msr->index) { case MSR_IA32_ARCH_CAPABILITIES: @@ -1672,12 +1672,12 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return kvm_x86_call(get_msr_feature)(msr->index, &msr->data); + return kvm_x86_call(get_feature_msr)(msr->index, &msr->data); } return 0; } -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct kvm_msr_entry msr; int r; @@ -1685,7 +1685,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) /* Unconditionally clear the output for simplicity */ msr.data = 0; msr.index = index; - r = kvm_get_msr_feature(&msr); + r = kvm_get_feature_msr(&msr); if (r == KVM_MSR_RET_UNSUPPORTED && kvm_msr_ignored_check(index, 0, false)) r = 0; @@ -4943,7 +4943,7 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(NULL, argp, do_get_msr_feature, 1); + r = msr_io(NULL, argp, do_get_feature_msr, 1); break; #ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: @@ -7382,7 +7382,7 @@ static void kvm_probe_feature_msr(u32 msr_index) .index = msr_index, }; - if (kvm_get_msr_feature(&msr)) + if (kvm_get_feature_msr(&msr)) return; msr_based_features[num_msr_based_features++] = msr_index; -- cgit v1.2.3 From 73b42dc69be8564d4951a14d00f827929fe5ef79 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:00 -0700 Subject: KVM: x86: Re-split x2APIC ICR into ICR+ICR2 for AMD (x2AVIC) Re-introduce the "split" x2APIC ICR storage that KVM used prior to Intel's IPI virtualization support, but only for AMD. While not stated anywhere in the APM, which describes the ICR as a single 64-bit register, AMD CPUs store the 64-bit ICR as two separate 32-bit values in ICR and ICR2. When IPI virtualization (IPIv on Intel, all AVIC flavors on AMD) is enabled, KVM needs to match CPU behavior as some ICR writes will be handled by the CPU, not by KVM. Add a kvm_x86_ops knob to control the underlying format used by the CPU to store the x2APIC ICR, and tune it to AMD vs. Intel regardless of whether or not x2AVIC is enabled. If KVM is handling all ICR writes, the storage format for x2APIC mode doesn't matter, and having the behavior follow AMD versus Intel will provide better test coverage and ease debugging.
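To make the two storage formats concrete, here is a minimal model of the split (AMD-style) versus unified (Intel-style) ICR layouts that the patch below selects between via the new x2apic_icr_is_split knob; the demo_* names are illustrative only, not KVM's symbols:

#include <stdint.h>

struct demo_apic_state {
	uint32_t icr;    /* low 32 bits (APIC offset 0x300) */
	uint32_t icr2;   /* high 32 bits (APIC offset 0x310), used when split */
	uint64_t icr64;  /* unified 64-bit storage, used when !split */
};

void demo_icr_write(struct demo_apic_state *s, uint64_t val, int split)
{
	if (split) {
		s->icr  = (uint32_t)val;
		s->icr2 = (uint32_t)(val >> 32);
	} else {
		s->icr64 = val;
	}
}

uint64_t demo_icr_read(const struct demo_apic_state *s, int split)
{
	if (split)
		return (uint64_t)s->icr | ((uint64_t)s->icr2 << 32);

	return s->icr64;
}

Reads and writes must agree on the layout, which is why the patch below reads the ICR back through a format-aware helper before replaying it in kvm_apic_write_nodecode().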
Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode") Cc: stable@vger.kernel.org Cc: Maxim Levitsky Cc: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20240719235107.3023592-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 42 +++++++++++++++++++++++++++++------------ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/main.c | 2 ++ 4 files changed, 36 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 95396e4cb3da..f9dfb2d62053 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1727,6 +1727,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + + const bool x2apic_icr_is_split; const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 63be07d7c782..c7180cb5f464 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2471,11 +2471,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) data &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg64(apic, APIC_ICR, data); + if (kvm_x86_ops.x2apic_icr_is_split) { + kvm_lapic_set_reg(apic, APIC_ICR, data); + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, data); + } trace_kvm_apic_write(APIC_ICR, data); return 0; } +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) +{ + if (kvm_x86_ops.x2apic_icr_is_split) + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; + + return kvm_lapic_get_reg64(apic, APIC_ICR); +} + /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { @@ -2493,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) - WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR))); + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } @@ -3013,18 +3027,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, /* * In x2APIC mode, the LDR is fixed and based on the id. And - * ICR is internally a single 64-bit register, but needs to be - * split to ICR+ICR2 in userspace for backwards compatibility. + * if the ICR is _not_ split, ICR is internally a single 64-bit + * register, but needs to be split to ICR+ICR2 in userspace for + * backwards compatibility. 
*/ - if (set) { + if (set) *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); - } else { - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + if (!kvm_x86_ops.x2apic_icr_is_split) { + if (set) { + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } } @@ -3222,7 +3240,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) u32 low; if (reg == APIC_ICR) { - *data = kvm_lapic_get_reg64(apic, APIC_ICR); + *data = kvm_x2apic_icr_read(apic); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d8cfe8f23327..eb3de01602b9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5053,6 +5053,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + + .x2apic_icr_is_split = true, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .apicv_post_state_restore = avic_apicv_post_state_restore, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 4f6023a0deb3..0a094ebad4b1 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -89,6 +89,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_nmi_window = vmx_enable_nmi_window, .enable_irq_window = vmx_enable_irq_window, .update_cr8_intercept = vmx_update_cr8_intercept, + + .x2apic_icr_is_split = false, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, -- cgit v1.2.3 From 1b5ef14dc656a25280d56795b73cf90dad64ad44 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:45:10 -0700 Subject: KVM: SVM: Add host SEV-ES save area structure into VMCB via a union Incorporate the _host_ SEV-ES save area into the VMCB as a union with the legacy save area. The SEV-ES variant used to save/load host state is larger than the legacy save area, but resides at the same offset. Prefix the field with "host" to make it as obvious as possible that the SEV-ES variant in the VMCB is only ever used for host state. Guest state for SEV-ES VMs is stored in a completely separate page (VMSA), albeit with the same layout as the host state. Add a compile-time assert to ensure the VMCB layout is correct, i.e. that KVM's layout matches the architectural definitions. No functional change intended. Link: https://lore.kernel.org/r/20240802204511.352017-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f0dea3750ca9..2b59b9951c90 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -516,6 +516,20 @@ struct ghcb { u32 ghcb_usage; } __packed; +struct vmcb { + struct vmcb_control_area control; + union { + struct vmcb_save_area save; + + /* + * For SEV-ES VMs, the save area in the VMCB is used only to + * save/load host state. 
Guest state resides in a separate + * page, the aptly named VM Save Area (VMSA), that is encrypted + * with the guest's private key. + */ + struct sev_es_save_area host_sev_es_save; + }; +} __packed; #define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 @@ -532,6 +546,7 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); /* Check offsets of reserved fields */ @@ -568,11 +583,6 @@ static inline void __unused_size_checks(void) BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } -struct vmcb { - struct vmcb_control_area control; - struct vmcb_save_area save; -} __packed; - #define SVM_CPUID_FUNC 0x8000000a #define SVM_SELECTOR_S_SHIFT 4 -- cgit v1.2.3 From f7f39c50edb9d336274371953275e0d3503b9b75 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 12:51:18 -0700 Subject: KVM: x86: Exit to userspace if fastpath triggers one on instruction skip Exit to userspace if a fastpath handler triggers such an exit, which can happen when skipping the instruction, e.g. due to userspace single-stepping the guest via KVM_GUESTDBG_SINGLESTEP or because of an emulation failure. Fixes: 404d5d7bff0d ("KVM: X86: Introduce more exit_fastpath_completion enum values") Link: https://lore.kernel.org/r/20240802195120.325560-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f9dfb2d62053..430a0d369322 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -211,6 +211,7 @@ enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, + EXIT_FASTPATH_EXIT_USERSPACE, }; typedef enum exit_fastpath_completion fastpath_t; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d392e9097d63..d6c81d59d487 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2206,8 +2206,10 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) } if (handled) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; + if (!kvm_skip_emulated_instruction(vcpu)) + ret = EXIT_FASTPATH_EXIT_USERSPACE; + else + ret = EXIT_FASTPATH_REENTER_GUEST; trace_kvm_msr_write(msr, data); } else { ret = EXIT_FASTPATH_NONE; @@ -11196,6 +11198,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; + r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; -- cgit v1.2.3 From 0617a769ce16b836659c8a712f394bfa3543a587 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:56 -0700 Subject: KVM: x86: Rename virtualization {en,dis}abling APIs to match common KVM Rename x86's the per-CPU vendor hooks used to enable virtualization in hardware to align with the recently renamed arch hooks. No functional change intended. 
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20240830043600.127750-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 4 ++-- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm/svm.c | 18 +++++++++--------- arch/x86/kvm/vmx/main.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 10 +++++----- arch/x86/kvm/vmx/x86_ops.h | 4 ++-- arch/x86/kvm/x86.c | 10 +++++----- 7 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 68ad4f923664..03b7e13f15bb 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -14,8 +14,8 @@ BUILD_BUG_ON(1) * be __static_call_return0. */ KVM_X86_OP(check_processor_compatibility) -KVM_X86_OP(hardware_enable) -KVM_X86_OP(hardware_disable) +KVM_X86_OP(enable_virtualization_cpu) +KVM_X86_OP(disable_virtualization_cpu) KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e4fc362ba3da..704aeecfe2c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1629,8 +1629,8 @@ struct kvm_x86_ops { int (*check_processor_compatibility)(void); - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d6f252555ab3..a9adbe10c12e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void) } } -static void svm_emergency_disable(void) +static void svm_emergency_disable_virtualization_cpu(void) { kvm_rebooting = true; kvm_cpu_svm_disable(); } -static void svm_hardware_disable(void) +static void svm_disable_virtualization_cpu(void) { /* Make sure we clean up behind us */ if (tsc_scaling) @@ -610,7 +610,7 @@ static void svm_hardware_disable(void) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void) +static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; @@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * TSC_AUX is always virtualized for SEV-ES guests when the feature is * available. The user return MSR support is not required in this case * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_hardware_enable()). + * (which has been initialized in svm_enable_virtualization_cpu()). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) @@ -3132,7 +3132,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * feature is available. The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT * from the host save area (which has been initialized in - * svm_hardware_enable()). + * svm_enable_virtualization_cpu()). 
*/ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; @@ -4980,8 +4980,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, + .enable_virtualization_cpu = svm_enable_virtualization_cpu, + .disable_virtualization_cpu = svm_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -5411,7 +5411,7 @@ static void __svm_exit(void) { kvm_x86_vendor_exit(); - cpu_emergency_unregister_virt_callback(svm_emergency_disable); + cpu_emergency_unregister_virt_callback(svm_emergency_disable_virtualization_cpu); } static int __init svm_init(void) @@ -5427,7 +5427,7 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable); + cpu_emergency_register_virt_callback(svm_emergency_disable_virtualization_cpu); /* * Common KVM initialization _must_ come last, after this, /dev/kvm is diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0bf35ebe8a1b..4a5bf92edccf 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -23,8 +23,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, - .hardware_enable = vmx_hardware_enable, - .hardware_disable = vmx_hardware_disable, + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, + .disable_virtualization_cpu = vmx_disable_virtualization_cpu, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476..cf7d937bfd2c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -755,7 +755,7 @@ fault: return -EIO; } -static void vmx_emergency_disable(void) +static void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -2844,7 +2844,7 @@ fault: return -EFAULT; } -int vmx_hardware_enable(void) +int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2881,7 +2881,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -void vmx_hardware_disable(void) +void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); @@ -8584,7 +8584,7 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); + cpu_emergency_unregister_virt_callback(vmx_emergency_disable_virtualization_cpu); vmx_cleanup_l1d_flush(); } @@ -8632,7 +8632,7 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable); + cpu_emergency_register_virt_callback(vmx_emergency_disable_virtualization_cpu); vmx_check_vmcs12_offsets(); diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3221cd1d01..205692c43a8e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -13,8 +13,8 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata; void vmx_hardware_unsetup(void); int vmx_check_processor_compat(void); -int vmx_hardware_enable(void); -void vmx_hardware_disable(void); +int vmx_enable_virtualization_cpu(void); +void vmx_disable_virtualization_cpu(void); int vmx_vm_init(struct kvm *kvm); void vmx_vm_destroy(struct kvm *kvm); int vmx_vcpu_precreate(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c 
b/arch/x86/kvm/x86.c index 1182baf0d487..431358167fa8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9749,7 +9749,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) guard(mutex)(&vendor_module_lock); - if (kvm_x86_ops.hardware_enable) { + if (kvm_x86_ops.enable_virtualization_cpu) { pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } @@ -9876,7 +9876,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return 0; out_unwind_ops: - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; kvm_x86_call(hardware_unsetup)(); out_mmu_exit: kvm_mmu_vendor_module_exit(); @@ -9917,7 +9917,7 @@ void kvm_x86_vendor_exit(void) WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif mutex_lock(&vendor_module_lock); - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; mutex_unlock(&vendor_module_lock); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); @@ -12528,7 +12528,7 @@ int kvm_arch_enable_virtualization_cpu(void) if (ret) return ret; - ret = kvm_x86_call(hardware_enable)(); + ret = kvm_x86_call(enable_virtualization_cpu)(); if (ret != 0) return ret; @@ -12610,7 +12610,7 @@ int kvm_arch_enable_virtualization_cpu(void) void kvm_arch_disable_virtualization_cpu(void) { - kvm_x86_call(hardware_disable)(); + kvm_x86_call(disable_virtualization_cpu)(); drop_user_return_notifiers(); } -- cgit v1.2.3 From 6d55a94222dbc64754fc138dbe4578dc5cac1c8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:35:59 -0700 Subject: x86/reboot: Unconditionally define cpu_emergency_virt_cb typedef Define cpu_emergency_virt_cb even if the kernel is being built without KVM support so that KVM can reference the typedef in asm/kvm_host.h without needing yet more #ifdefs. No functional change intended. Acked-by: Kai Huang Reviewed-by: Chao Gao Reviewed-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/reboot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 6536873f8fc0..d0ef2a678d66 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,8 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) typedef void (cpu_emergency_virt_cb)(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); -- cgit v1.2.3 From 590b09b1d88e18ae57f89930a6f7b89795d2e9f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 29 Aug 2024 21:36:00 -0700 Subject: KVM: x86: Register "emergency disable" callbacks when virt is enabled Register the "disable virtualization in an emergency" callback just before KVM enables virtualization in hardware, as there is no functional need to keep the callbacks registered while KVM happens to be loaded, but is inactive, i.e. if KVM hasn't enabled virtualization. Note, unregistering the callback every time the last VM is destroyed could have measurable latency due to the synchronize_rcu() needed to ensure all references to the callback are dropped before KVM is unloaded. 
But the latency should be a small fraction of the total latency of disabling virtualization across all CPUs, and userspace can set enable_virt_at_load to completely eliminate the runtime overhead. Add a pointer in kvm_x86_ops to allow vendor code to provide its callback. There is no reason to force vendor code to do the registration, and either way KVM would need a new kvm_x86_ops hook. Suggested-by: Kai Huang Reviewed-by: Chao Gao Reviewed-by: Kai Huang Acked-by: Kai Huang Tested-by: Farrah Chen Signed-off-by: Sean Christopherson Message-ID: <20240830043600.127750-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 5 +---- arch/x86/kvm/vmx/main.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 6 +----- arch/x86/kvm/vmx/x86_ops.h | 1 + arch/x86/kvm/x86.c | 10 ++++++++++ 6 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 704aeecfe2c9..52443ccda320 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -36,6 +36,7 @@ #include #include #include +#include #define __KVM_HAVE_ARCH_VCPU_DEBUGFS @@ -1631,6 +1632,8 @@ struct kvm_x86_ops { int (*enable_virtualization_cpu)(void); void (*disable_virtualization_cpu)(void); + cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; + void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a9adbe10c12e..9a0506ef87df 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4982,6 +4982,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .hardware_unsetup = svm_hardware_unsetup, .enable_virtualization_cpu = svm_enable_virtualization_cpu, .disable_virtualization_cpu = svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -5410,8 +5411,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static void __svm_exit(void) { kvm_x86_vendor_exit(); - - cpu_emergency_unregister_virt_callback(svm_emergency_disable_virtualization_cpu); } static int __init svm_init(void) @@ -5427,8 +5426,6 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable_virtualization_cpu); - /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! 
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 4a5bf92edccf..7e2e78a14257 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -25,6 +25,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_virtualization_cpu = vmx_enable_virtualization_cpu, .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, + .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cf7d937bfd2c..89682832dded 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -755,7 +755,7 @@ fault: return -EIO; } -static void vmx_emergency_disable_virtualization_cpu(void) +void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -8584,8 +8584,6 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable_virtualization_cpu); - vmx_cleanup_l1d_flush(); } @@ -8632,8 +8630,6 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable_virtualization_cpu); - vmx_check_vmcs12_offsets(); /* diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 205692c43a8e..b6a7cfc6ae31 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -15,6 +15,7 @@ void vmx_hardware_unsetup(void); int vmx_check_processor_compat(void); int vmx_enable_virtualization_cpu(void); void vmx_disable_virtualization_cpu(void); +void vmx_emergency_disable_virtualization_cpu(void); int vmx_vm_init(struct kvm *kvm); void vmx_vm_destroy(struct kvm *kvm); int vmx_vcpu_precreate(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 431358167fa8..f72e5d89e942 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12512,6 +12512,16 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) } EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); +void kvm_arch_enable_virtualization(void) +{ + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +void kvm_arch_disable_virtualization(void) +{ + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + int kvm_arch_enable_virtualization_cpu(void) { struct kvm *kvm; -- cgit v1.2.3 From 363010e1dd0efd4778637c1a5a5aaffbcfcae919 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Sep 2024 21:34:08 -0700 Subject: KVM: nVMX: Get to-be-acknowledge IRQ for nested VM-Exit at injection site Move the logic to get the to-be-acknowledge IRQ for a nested VM-Exit from nested_vmx_vmexit() to vmx_check_nested_events(), which is subtly the one and only path where KVM invokes nested_vmx_vmexit() with EXIT_REASON_EXTERNAL_INTERRUPT. A future fix will perform a last-minute check on L2's nested posted interrupt notification vector, just before injecting a nested VM-Exit. To handle that scenario correctly, KVM needs to get the interrupt _before_ injecting VM-Exit, as simply querying the highest priority interrupt, via kvm_cpu_has_interrupt(), would result in TOCTOU bug, as a new, higher priority interrupt could arrive between kvm_cpu_has_interrupt() and kvm_cpu_get_interrupt(). 
Unfortunately, simply moving the call to kvm_cpu_get_interrupt() doesn't suffice, as a VMWRITE to GUEST_INTERRUPT_STATUS.SVI is hiding in kvm_get_apic_interrupt(), and acknowledging the interrupt before nested VM-Exit would cause the VMWRITE to hit vmcs02 instead of vmcs01. Open code a rough equivalent to kvm_cpu_get_interrupt() so that the IRQ is acknowledged after emulating VM-Exit, taking care to avoid the TOCTOU issue described above. Opportunistically convert the WARN_ON() to a WARN_ON_ONCE(). If KVM has a bug that results in a false positive from kvm_cpu_has_interrupt(), spamming dmesg won't help the situation. Note, nested_vmx_reflect_vmexit() can never reflect external interrupts as they are always "wanted" by L0. Link: https://lore.kernel.org/r/20240906043413.1049633-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 3 ++- arch/x86/kvm/vmx/nested.c | 36 +++++++++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a93ac1b9be9..aa31c4b94977 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2256,6 +2256,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_extint(struct kvm_vcpu *v); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3d7eb11d0e45..810da99ff7ed 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); * Read pending interrupt(from non-APIC source) * vector and intack. */ -static int kvm_cpu_get_extint(struct kvm_vcpu *v) +int kvm_cpu_get_extint(struct kvm_vcpu *v) { if (!kvm_cpu_has_extint(v)) { WARN_ON(!lapic_in_kernel(v)); @@ -131,6 +131,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) } else return kvm_pic_read_irq(v->kvm); /* PIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); /* * Read pending interrupt vector and intack. diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 867de342df33..1a6dc85cde18 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4285,11 +4285,37 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { + int irq; + if (block_nested_events) return -EBUSY; if (!nested_exit_on_intr(vcpu)) goto no_vmexit; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + + if (!nested_exit_intr_ack_set(vcpu)) { + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + irq = kvm_cpu_get_extint(vcpu); + if (irq != -1) { + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + return 0; + } + + irq = kvm_apic_has_interrupt(vcpu); + WARN_ON_ONCE(irq < 0); + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + + /* + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI + * if APICv is active. 
+ */ + kvm_apic_ack_interrupt(vcpu, irq); return 0; } @@ -4970,14 +4996,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - nested_exit_intr_ack_set(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, -- cgit v1.2.3 From 4ececec19a0914873634ad69bbaca5557c33e855 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:17 -0700 Subject: KVM: x86/mmu: Replace PFERR_NESTED_GUEST_PAGE with a more descriptive helper Drop the globally visible PFERR_NESTED_GUEST_PAGE and replace it with a more appropriately named is_write_to_guest_page_table(). The macro name is misleading, because while all nNPT walks match PAGE|WRITE|PRESENT, the reverse is not true. No functional change intended. Link: https://lore.kernel.org/r/20240831001538.336683-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 ---- arch/x86/kvm/mmu/mmu.c | 9 ++++++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78..797433994d56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -280,10 +280,6 @@ enum x86_intercept_stage; #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_WRITE_MASK | \ - PFERR_PRESENT_MASK) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5226bb055d99..1eedd1932e5f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5945,6 +5945,13 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, write_unlock(&vcpu->kvm->mmu_lock); } +static bool is_write_to_guest_page_table(u64 error_code) +{ + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; + + return (error_code & mask) == mask; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -6008,7 +6015,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err * and resume the guest. */ if (vcpu->arch.mmu->root_role.direct && - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { + is_write_to_guest_page_table(error_code)) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } -- cgit v1.2.3 From 01dd4d319207c4cfd51a1c9a1812909e944d8c86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:23 -0700 Subject: KVM: x86/mmu: Apply retry protection to "fast nTDP unprotect" path Move the anti-infinite-loop protection provided by last_retry_{eip,addr} into kvm_mmu_write_protect_fault() so that it guards unprotect+retry that never hits the emulator, as well as reexecute_instruction(), which is the last ditch "might as well try it" logic that kicks in when emulation fails on an instruction that faulted on a write-protected gfn. Add a new helper, kvm_mmu_unprotect_gfn_and_retry(), to set the retry fields and deduplicate other code (with more to come). 
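A simplified model of the retry guard introduced here, using hypothetical demo_* names (the real code tracks the fields in vcpu->arch and refreshes them only when it actually unprotects a gfn):

#include <stdint.h>
#include <stdbool.h>

struct demo_retry_state {
	uint64_t last_retry_rip;
	uint64_t last_retry_addr;
};

/*
 * Returns true if the write-protection fault should be retried after
 * unprotecting the gfn, false if it should be emulated instead.  A second
 * fault with the same (RIP, address) pair is never retried, which breaks
 * the potential infinite loop described in the changelog above.
 */
bool demo_unprotect_and_retry(struct demo_retry_state *s, uint64_t rip,
			      uint64_t fault_addr, bool unprotected_a_page)
{
	if (s->last_retry_rip == rip && s->last_retry_addr == fault_addr)
		return false;

	/* Forget the previous attempt; it no longer guards anything. */
	s->last_retry_rip = 0;
	s->last_retry_addr = 0;

	if (!unprotected_a_page)
		return false;

	/* Remember this attempt so re-faulting here falls back to emulation. */
	s->last_retry_rip = rip;
	s->last_retry_addr = fault_addr;
	return true;
}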
Link: https://lore.kernel.org/r/20240831001538.336683-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 39 ++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 27 +-------------------------- 3 files changed, 40 insertions(+), 27 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 797433994d56..fd115feb49b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2133,6 +2133,7 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); +bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa); void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index edca32a4e874..e992da21f4e5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2713,6 +2713,22 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } +bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) +{ + gpa_t gpa = cr2_or_gpa; + bool r; + + if (!vcpu->arch.mmu->root_role.direct) + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + if (r) { + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); + vcpu->arch.last_retry_addr = cr2_or_gpa; + } + return r; +} + static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; @@ -5956,6 +5972,27 @@ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, { bool direct = vcpu->arch.mmu->root_role.direct; + /* + * Do not try to unprotect and retry if the vCPU re-faulted on the same + * RIP with the same address that was previously unprotected, as doing + * so will likely put the vCPU into an infinite loop. E.g. if the vCPU uses + * a non-page-table modifying instruction on the PDE that points to the + * instruction, then unprotecting the gfn will unmap the instruction's + * code, i.e. make it impossible for the instruction to ever complete. + */ + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && + vcpu->arch.last_retry_addr == cr2_or_gpa) + return RET_PF_EMULATE; + + /* + * Reset the unprotect+retry values that guard against infinite loops. + * The values will be refreshed if KVM explicitly unprotects a gfn and + * retries, in all other cases it's safe to retry in the future even if + * the next page fault happens on the same RIP+address. + */ + vcpu->arch.last_retry_eip = 0; + vcpu->arch.last_retry_addr = 0; + /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT
*/ if (direct && is_write_to_guest_page_table(error_code) && - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d26e107225f7..ce075e07126b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8932,27 +8932,13 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr; - gpa_t gpa = cr2_or_gpa; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; /* * If the emulation is caused by #PF and it is non-page_table * writing instruction, it means the VM-EXIT is caused by shadow * page protected, we can zap the shadow page and retry this * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; @@ -8963,18 +8949,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) - return false; - - if (!vcpu->arch.mmu->root_role.direct) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - if (!kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa))) - return false; - - vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); - vcpu->arch.last_retry_addr = cr2_or_gpa; - return true; + return kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); } static int complete_emulated_mmio(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 4df685664bed04794ad72b58d8af1fa4fcc60261 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:33 -0700 Subject: KVM: x86: Update retry protection fields when forcing retry on emulation failure When retrying the faulting instruction after emulation failure, refresh the infinite loop protection fields even if no shadow pages were zapped, i.e. avoid hitting an infinite loop even when retrying the instruction as a last-ditch effort to avoid terminating the guest. 
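Continuing the sketch given a few commits above (reusing the hypothetical struct demo_retry_state), the always_retry refinement described here records the (RIP, address) pair even when no shadow page was zapped, so the last-ditch retry after an emulation failure also cannot loop forever:

/*
 * As above, but usable from the "retry as a last resort" path: the retry
 * fields are refreshed whenever a retry is going to happen, not only when
 * a shadow page was actually zapped.
 */
bool demo_unprotect_and_retry_forced(struct demo_retry_state *s, uint64_t rip,
				     uint64_t fault_addr,
				     bool unprotected_a_page, bool always_retry)
{
	bool retry = unprotected_a_page;

	if (retry || always_retry) {
		s->last_retry_rip = rip;
		s->last_retry_addr = fault_addr;
	}

	/* The forced-retry caller ignores this; it retries regardless. */
	return retry;
}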
Link: https://lore.kernel.org/r/20240831001538.336683-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 10 +++++++++- arch/x86/kvm/mmu/mmu.c | 12 +++++++----- arch/x86/kvm/x86.c | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fd115feb49b3..cdee59f3d15b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2133,7 +2133,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); -bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa); +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 195ba7430720..4b4edaf7dc06 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2713,10 +2713,11 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) return r; } -bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry) { gpa_t gpa = cr2_or_gpa; - bool r; + bool r = false; /* * Bail early if there aren't any write-protected shadow pages to avoid @@ -2727,16 +2728,17 @@ bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa) * skipping the unprotect+retry path, which is also an optimization. */ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) - return false; + goto out; if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); if (gpa == INVALID_GPA) - return false; + goto out; } r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - if (r) { +out: + if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ad942892fa2c..843ddb982b35 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8886,7 +8886,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * guest to let the CPU re-execute the instruction in the hope that the * CPU can cleanly execute the instruction that KVM failed to emulate. */ - kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa); + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); /* * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible -- cgit v1.2.3 From 6b3dcabc10911711eba15816d808e2a18f130406 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Aug 2024 17:15:35 -0700 Subject: KVM: x86/mmu: Subsume kvm_mmu_unprotect_page() into the and_retry() version Fold kvm_mmu_unprotect_page() into kvm_mmu_unprotect_gfn_and_retry() now that all other direct usage is gone. No functional change intended. 
Link: https://lore.kernel.org/r/20240831001538.336683-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++-------------------- 2 files changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cdee59f3d15b..8f4164f58b6c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2132,7 +2132,6 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4b4edaf7dc06..29305403f956 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2695,27 +2695,12 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) write_unlock(&kvm->mmu_lock); } -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - int r; - - r = 0; - write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - r = 1; - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - write_unlock(&kvm->mmu_lock); - - return r; -} - bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry) { + struct kvm *kvm = vcpu->kvm; + LIST_HEAD(invalid_list); + struct kvm_mmu_page *sp; gpa_t gpa = cr2_or_gpa; bool r = false; @@ -2727,7 +2712,7 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * positive is benign, and a false negative will simply result in KVM * skipping the unprotect+retry path, which is also an optimization. */ - if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) goto out; if (!vcpu->arch.mmu->root_role.direct) { @@ -2736,7 +2721,15 @@ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, goto out; } - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + r = false; + write_lock(&kvm->mmu_lock); + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) { + r = true; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + kvm_mmu_commit_zap_page(kvm, &invalid_list); + write_unlock(&kvm->mmu_lock); + out: if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); -- cgit v1.2.3