diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2026-02-09 20:53:47 +0300 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2026-02-09 20:53:47 +0300 |
| commit | 9e03b7caf4e65f5a5841dfed540fdcc3ff061734 (patch) | |
| tree | 9e0e1c19699b568bc2de254abf58fc58b7b13049 | |
| parent | 4215ee0d7bb5358882375c84d3cd0488bb5813b2 (diff) | |
| parent | 6517dfbcc918f970a928d9dc17586904bac06893 (diff) | |
| download | linux-9e03b7caf4e65f5a5841dfed540fdcc3ff061734.tar.xz | |
Merge tag 'kvm-x86-misc-6.20' of https://github.com/kvm-x86/linux into HEAD
KVM x86 misc changes for 6.20
- Disallow changing the virtual CPU model if L2 is active, for all the same
reasons KVM disallows change the model after the first KVM_RUN.
- Fix a bug where KVM would incorrectly reject host accesses to PV MSRs that
were advertised as supported to userspace when running with
KVM_CAP_ENFORCE_PV_FEATURE_CPUID enabled.
- Fix a bug where KVM would attempt to read protect guest state (CR3) when
configuring an async #PF entry.
- Fail the build if EXPORT_SYMBOL_GPL or EXPORT_SYMBOL is used in KVM (for x86
only) to enforce usage of EXPORT_SYMBOL_FOR_KVM_INTERNAL. Explicitly allow
the few exports that are intended for external usage.
- Ignore -EBUSY when checking nested events after a vCPU exits blocking as
the WARN is user-triggerable, and because exiting to userspace on -EBUSY
does more harm than good in pretty much every situation.
- Throw in the towel and drop the WARN on INIT/SIPI being blocked when vCPU is
in Wait-For-SIPI, as playing whack-a-mole with syzkaller turned out to be an
unwinnable game.
- Add support for new Intel instructions that don't require anything beyond
enumerating feature flags to userspace.
- Grab SRCU when reading PDPTRs in KVM_GET_SREGS2.
- Add WARNs to guard against modifying KVM's CPU caps outside of the intended
setup flow, as nested VMX in particular is sensitive to unexpected changes
in KVM's golden configuration.
- Add a quirk to allow userspace to opt-in to actually suppress EOI broadcasts
when the suppression feature is enabled by the guest (currently limited to
split IRQCHIP, i.e. userspace I/O APIC). Sadly, simply fixing KVM to honor
Suppress EOI Broadcasts isn't an option as some userspaces have come to rely
on KVM's buggy behavior (KVM advertises Supress EOI Broadcast irrespective
of whether or not userspace I/O APIC supports Directed EOIs).
- Minor cleanups.
| -rw-r--r-- | Documentation/virt/kvm/api.rst | 28 | ||||
| -rw-r--r-- | arch/x86/include/asm/cpufeatures.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm_host.h | 9 | ||||
| -rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 6 | ||||
| -rw-r--r-- | arch/x86/kvm/Makefile | 49 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.c | 75 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.h | 12 | ||||
| -rw-r--r-- | arch/x86/kvm/ioapic.c | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 77 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 11 | ||||
| -rw-r--r-- | arch/x86/kvm/pmu.c | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/reverse_cpuid.h | 19 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/svm.c | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 81 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.h | 15 |
17 files changed, 326 insertions, 69 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 49f043246f95..095095ae01dd 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7908,8 +7908,10 @@ Will return -EBUSY if a VCPU has already been created. Valid feature flags in args[0] are:: - #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) - #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2) + #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3) Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, @@ -7922,6 +7924,28 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC without interrupt remapping. This is undesirable in logical mode, where 0xff represents CPUs 0-7 in cluster 0. +Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable +Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI +Broadcast to the guest and suppress LAPIC EOI broadcasts when the guest +sets the Suppress EOI Broadcast bit in the SPIV register. This flag is +supported only when using a split IRQCHIP. + +Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for +Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise +support to the guest. + +Modern VMMs should either enable KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST +or KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If not, legacy quirky +behavior will be used by KVM: in split IRQCHIP mode, KVM will advertise +support for Suppress EOI Broadcasts but not actually suppress EOI +broadcasts; for in-kernel IRQCHIP mode, KVM will not advertise support for +Suppress EOI Broadcasts. + +Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and +KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error, +as will setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST without a split +IRCHIP. + 7.8 KVM_CAP_S390_USER_INSTR0 ---------------------------- diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 81f7b3b91986..c01fdde465de 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -326,6 +326,7 @@ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ +#define X86_FEATURE_MOVRS (12*32+31) /* MOVRS instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0353d8b6988c..91c26f159d89 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -784,6 +784,8 @@ enum kvm_only_cpuid_leafs { CPUID_24_0_EBX, CPUID_8000_0021_ECX, CPUID_7_1_ECX, + CPUID_1E_1_EAX, + CPUID_24_1_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -1234,6 +1236,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +enum kvm_suppress_eoi_broadcast_mode { + KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */ + KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */ + KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */ +}; + struct kvm_x86_msr_filter { u8 count; bool default_allow:1; @@ -1483,6 +1491,7 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode; bool has_mapped_host_mmio; bool guest_can_read_msr_platform_info; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index b2c928c5965d..846a63215ce1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -916,8 +916,10 @@ struct kvm_sev_snp_launch_finish { __u64 pad1[4]; }; -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +#define KVM_X2APIC_API_USE_32BIT_IDS _BITULL(0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK _BITULL(1) +#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST _BITULL(2) +#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST _BITULL(3) struct kvm_hyperv_eventfd { __u32 conn_id; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c4b8950c7abe..77337c37324b 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -47,3 +47,52 @@ $(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE targets += kvm-asm-offsets.s clean-files += kvm-asm-offsets.h + + +# Fail the build if there is unexpected EXPORT_SYMBOL_GPL (or EXPORT_SYMBOL) +# usage. All KVM-internal exports should use EXPORT_SYMBOL_FOR_KVM_INTERNAL. +# Only a handful of exports intended for other modules (VFIO, KVMGT) should +# use EXPORT_SYMBOL_GPL, and EXPORT_SYMBOL should never be used. +ifdef CONFIG_KVM_X86 +# Search recursively for whole words and print line numbers. Filter out the +# allowed set of exports, i.e. those that are intended for external usage. +exports_grep_trailer := --include='*.[ch]' -nrw $(srctree)/virt/kvm $(srctree)/arch/x86/kvm | \ + grep -v -e kvm_page_track_register_notifier \ + -e kvm_page_track_unregister_notifier \ + -e kvm_write_track_add_gfn \ + -e kvm_write_track_remove_gfn \ + -e kvm_get_kvm \ + -e kvm_get_kvm_safe \ + -e kvm_put_kvm + +# Force grep to emit a goofy group separator that can in turn be replaced with +# the above newline macro (newlines in Make are a nightmare). Note, grep only +# prints the group separator when N lines of context are requested via -C, +# a.k.a. --NUM. Simply request zero lines. Print the separator only after +# filtering out expected exports to avoid extra newlines in the error message. +define get_kvm_exports +$(shell grep "$(1)" -C0 $(exports_grep_trailer) | grep "$(1)" -C0 --group-separator="!SEP!") +endef + +define check_kvm_exports +nr_kvm_exports := $(shell grep "$(1)" $(exports_grep_trailer) | wc -l) + +ifneq (0,$$(nr_kvm_exports)) +$$(error ERROR ***\ +$$(newline)found $$(nr_kvm_exports) unwanted occurrences of $(1):\ +$$(newline) $(subst !SEP!,$$(newline) ,$(call get_kvm_exports,$(1)))\ +$$(newline)in directories:\ +$$(newline) $(srctree)/arch/x86/kvm\ +$$(newline) $(srctree)/virt/kvm\ +$$(newline)Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not $(1)) +endif # nr_kvm_exports != 0 +undefine nr_kvm_exports +endef # check_kvm_exports + +$(eval $(call check_kvm_exports,EXPORT_SYMBOL_GPL)) +$(eval $(call check_kvm_exports,EXPORT_SYMBOL)) + +undefine check_kvm_exports +undefine get_kvm_exports +undefine exports_grep_trailer +endif # CONFIG_KVM_X86 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c590a5bd3196..7fe4e58a6ebf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -36,6 +36,9 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); +bool kvm_is_configuring_cpu_caps __read_mostly; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps); + struct cpuid_xstate_sizes { u32 eax; u32 ebx; @@ -534,17 +537,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with - * the core vCPU model on the fly. It would've been better to forbid any - * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately - * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do + * KVM does not correctly handle changing guest CPUID after KVM_RUN or + * while L2 is active, as MAXPHYADDR, GBPAGES support, AMD reserved bit + * behavior, etc. aren't tracked in kvm_mmu_page_role, and L2 state + * can't be adjusted (without breaking L2 in some way). As a result, + * KVM may reuse SPs/SPTEs and/or run L2 with bad/misconfigured state. + * + * In practice, no sane VMM mucks with the core vCPU model on the fly. + * It would've been better to forbid any KVM_SET_CPUID{,2} calls after + * KVM_RUN or KVM_SET_NESTED_STATE altogether, but unfortunately some + * VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (kvm_vcpu_has_run(vcpu)) { + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) goto err; @@ -823,10 +829,13 @@ do { \ /* DS is defined by ptrace-abi.h on 32-bit builds. */ #undef DS -void kvm_set_cpu_caps(void) +void kvm_initialize_cpu_caps(void) { memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); + WARN_ON_ONCE(kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = true; + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); @@ -1025,6 +1034,7 @@ void kvm_set_cpu_caps(void) F(AMX_FP16), F(AVX_IFMA), F(LAM), + F(MOVRS), ); kvm_cpu_cap_init(CPUID_7_1_ECX, @@ -1063,12 +1073,27 @@ void kvm_set_cpu_caps(void) SCATTERED_F(SGX_EDECCSSA), ); + kvm_cpu_cap_init(CPUID_1E_1_EAX, + F(AMX_INT8_ALIAS), + F(AMX_BF16_ALIAS), + F(AMX_COMPLEX_ALIAS), + F(AMX_FP16_ALIAS), + F(AMX_FP8), + F(AMX_TF32), + F(AMX_AVX512), + F(AMX_MOVRS), + ); + kvm_cpu_cap_init(CPUID_24_0_EBX, F(AVX10_128), F(AVX10_256), F(AVX10_512), ); + kvm_cpu_cap_init(CPUID_24_1_ECX, + F(AVX10_VNNI_INT), + ); + kvm_cpu_cap_init(CPUID_8000_0001_ECX, F(LAHF_LM), F(CMP_LEGACY), @@ -1270,7 +1295,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps); #undef F #undef SCATTERED_F @@ -1624,6 +1649,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } + + max_idx = entry->eax = min(entry->eax, 1u); + + /* KVM only supports up to 0x1e.0x1, capped above via min(). */ + if (max_idx >= 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_1E_1_EAX); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + } break; case 0x24: { u8 avx10_version; @@ -1633,18 +1672,30 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } + max_idx = entry->eax = min(entry->eax, 1u); /* * The AVX10 version is encoded in EBX[7:0]. Note, the version * is guaranteed to be >=1 if AVX10 is supported. Note #2, the * version needs to be captured before overriding EBX features! */ - avx10_version = min_t(u8, entry->ebx & 0xff, 1); + avx10_version = min_t(u8, entry->ebx & 0xff, 2); cpuid_entry_override(entry, CPUID_24_0_EBX); entry->ebx |= avx10_version; - entry->eax = 0; entry->ecx = 0; entry->edx = 0; + + /* KVM only supports up to 0x24.0x1, capped above via min(). */ + if (max_idx >= 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_24_1_ECX); + entry->eax = 0; + entry->ebx = 0; + entry->edx = 0; + } break; } case KVM_CPUID_SIGNATURE: { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d3f5ae15a7ca..039b8e6f40ba 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -8,7 +8,15 @@ #include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; -void kvm_set_cpu_caps(void); +extern bool kvm_is_configuring_cpu_caps __read_mostly; + +void kvm_initialize_cpu_caps(void); + +static inline void kvm_finalize_cpu_caps(void) +{ + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = false; +} void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, @@ -188,6 +196,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } @@ -195,6 +204,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2c2783296aed..a26fa4222f29 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -561,7 +561,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + kvm_lapic_suppress_eoi_broadcast(apic)) return; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1597dd0b0cc6..738ec3c1b0b5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -105,6 +105,63 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } +static bool kvm_lapic_advertise_suppress_eoi_broadcast(struct kvm *kvm) +{ + switch (kvm->arch.suppress_eoi_broadcast_mode) { + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: + return true; + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: + return false; + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: + /* + * The default in-kernel I/O APIC emulates the 82093AA and does not + * implement an EOI register. Some guests (e.g. Windows with the + * Hyper-V role enabled) disable LAPIC EOI broadcast without + * checking the I/O APIC version, which can cause level-triggered + * interrupts to never be EOI'd. + * + * To avoid this, KVM doesn't advertise Suppress EOI Broadcast + * support when using the default in-kernel I/O APIC. + * + * Historically, in split IRQCHIP mode, KVM always advertised + * Suppress EOI Broadcast support but did not actually suppress + * EOIs, resulting in quirky behavior. + */ + return !ioapic_in_kernel(kvm); + default: + WARN_ON_ONCE(1); + return false; + } +} + +bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic) +{ + struct kvm *kvm = apic->vcpu->kvm; + + if (!(kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + return false; + + switch (kvm->arch.suppress_eoi_broadcast_mode) { + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: + return true; + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: + return false; + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: + /* + * Historically, in split IRQCHIP mode, KVM ignored the suppress + * EOI broadcast bit set by the guest and broadcasts EOIs to the + * userspace I/O APIC. For In-kernel I/O APIC, the support itself + * is not advertised, can only be enabled via KVM_SET_APIC_STATE, + * and KVM's I/O APIC doesn't emulate Directed EOIs; but if the + * feature is enabled, it is respected (with odd behavior). + */ + return ioapic_in_kernel(kvm); + default: + WARN_ON_ONCE(1); + return false; + } +} + __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); @@ -554,15 +611,9 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); - /* - * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) - * which doesn't have EOI register; Some buggy OSes (e.g. Windows with - * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC - * version first and level-triggered interrupts never get EOIed in - * IOAPIC. - */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && - !ioapic_in_kernel(vcpu->kvm)) + kvm_lapic_advertise_suppress_eoi_broadcast(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } @@ -1517,6 +1568,15 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { + /* + * Don't exit to userspace if the guest has enabled Directed + * EOI, a.k.a. Suppress EOI Broadcasts, in which case the local + * APIC doesn't broadcast EOIs (the guest must EOI the target + * I/O APIC(s) directly). + */ + if (kvm_lapic_suppress_eoi_broadcast(apic)) + return; + apic->vcpu->arch.pending_ioapic_eoi = vector; kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); return; @@ -3498,7 +3558,6 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) * wait-for-SIPI (WFS). */ if (!kvm_apic_init_sipi_allowed(vcpu)) { - WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); clear_bit(KVM_APIC_SIPI, &apic->pending_events); return 0; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 282b9b7da98c..e5f5a222eced 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -231,6 +231,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic); + void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 02c450686b4a..3911ac9bddfd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4521,7 +4521,10 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; - arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); + if (arch.direct_map) + arch.cr3 = (unsigned long)INVALID_GPA; + else + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); @@ -6031,11 +6034,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); - /* - * Changing guest CPUID after KVM_RUN is forbidden, see the comment in - * kvm_arch_vcpu_ioctl(). - */ - KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); + KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 487ad19a236e..ff20b4102173 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -853,7 +853,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + if (KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm)) return; /* diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 81b4a7acf72e..657f5f743ed9 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -44,11 +44,28 @@ #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* + * Intel-defined sub-features, CPUID level 0x0000001E:1 (EAX). Note, several + * of the bits are aliases to features of the same name that are enumerated via + * various CPUID.0x7 sub-leafs. + */ +#define X86_FEATURE_AMX_INT8_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 0) +#define X86_FEATURE_AMX_BF16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 1) +#define X86_FEATURE_AMX_COMPLEX_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 2) +#define X86_FEATURE_AMX_FP16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 3) +#define X86_FEATURE_AMX_FP8 KVM_X86_FEATURE(CPUID_1E_1_EAX, 4) +#define X86_FEATURE_AMX_TF32 KVM_X86_FEATURE(CPUID_1E_1_EAX, 6) +#define X86_FEATURE_AMX_AVX512 KVM_X86_FEATURE(CPUID_1E_1_EAX, 7) +#define X86_FEATURE_AMX_MOVRS KVM_X86_FEATURE(CPUID_1E_1_EAX, 8) + /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) +/* Intel-defined sub-features, CPUID level 0x00000024:1 (ECX) */ +#define X86_FEATURE_AVX10_VNNI_INT KVM_X86_FEATURE(CPUID_24_1_ECX, 2) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -90,6 +107,8 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, + [CPUID_1E_1_EAX] = { 0x1e, 1, CPUID_EAX}, + [CPUID_24_1_ECX] = { 0x24, 1, CPUID_ECX}, }; /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8b0ac67becae..9ee74c57bd51 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5259,7 +5259,7 @@ static __init void svm_adjust_mmio_mask(void) static __init void svm_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); kvm_caps.supported_perf_cap = 0; @@ -5343,6 +5343,7 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); kvm_setup_xss_caps(); + kvm_finalize_cpu_caps(); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 530981a42c96..edf12bf58578 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8039,7 +8039,7 @@ static __init u64 vmx_get_perf_capabilities(void) static __init void vmx_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); /* CPUID 0x1 */ if (nested) @@ -8098,6 +8098,7 @@ static __init void vmx_set_cpu_caps(void) } kvm_setup_xss_caps(); + kvm_finalize_cpu_caps(); } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f66c7441272..08a6d6e20e9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -121,8 +121,10 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ - KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK | \ + KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \ + KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); @@ -2314,13 +2316,14 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) u64 val; /* - * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does - * not support modifying the guest vCPU model on the fly, e.g. changing - * the nVMX capabilities while L2 is running is nonsensical. Allow - * writes of the same value, e.g. to allow userspace to blindly stuff - * all MSRs when emulating RESET. + * Reject writes to immutable feature MSRs if the vCPU model is frozen, + * as KVM doesn't support modifying the guest vCPU model on the fly, + * e.g. changing the VMX capabilities MSRs while L2 is active is + * nonsensical. Allow writes of the same value, e.g. so that userspace + * can blindly stuff all MSRs when emulating RESET. */ - if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) && + kvm_is_immutable_feature_msr(index) && (do_get_msr(vcpu, index, &val) || *data != val)) return -EINVAL; @@ -4096,47 +4099,47 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (data & 0x1) { /* * Pairs with the smp_mb__after_atomic() in @@ -4149,7 +4152,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (unlikely(!sched_info_on())) return 1; @@ -4167,7 +4170,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; @@ -4175,7 +4178,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; /* only enable bit supported */ if (data & (-1ULL << 1)) @@ -4476,61 +4479,61 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.time; break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.apf.msr_en_val; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.msr_kvm_poll_control; break; @@ -4931,6 +4934,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; + if (kvm && !irqchip_split(kvm)) + r &= ~KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST; break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops.nested_ops->get_state ? @@ -6748,11 +6753,24 @@ split_irqchip_unlock: if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) break; + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && + (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)) + break; + + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && + !irqchip_split(kvm)) + break; + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) kvm->arch.x2apic_broadcast_quirk_disabled = true; + if (cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_ENABLED; + if (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_DISABLED; + r = 0; break; case KVM_CAP_X86_DISABLE_EXITS: @@ -11609,8 +11627,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { int r = kvm_check_nested_events(vcpu); - WARN_ON_ONCE(r == -EBUSY); - if (r < 0) + if (r < 0 && r != -EBUSY) return 0; } @@ -12158,9 +12175,11 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) return; if (is_pae_paging(vcpu)) { + kvm_vcpu_srcu_read_lock(vcpu); for (i = 0 ; i < 4 ; i++) sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + kvm_vcpu_srcu_read_unlock(vcpu); } } @@ -13320,7 +13339,7 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) #endif kvm_mmu_pre_destroy_vm(kvm); - static_call_cond(kvm_x86_vm_pre_destroy)(kvm); + kvm_x86_call(vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 00de24f55b1f..ff20e62d98c6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -172,9 +172,20 @@ static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) indirect_branch_prediction_barrier(); } -static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +/* + * Disallow modifying CPUID and feature MSRs, which affect the core virtual CPU + * model exposed to the guest and virtualized by KVM, if the vCPU has already + * run or is in guest mode (L2). In both cases, KVM has already consumed the + * current virtual CPU model, and doesn't support "unwinding" to react to the + * new model. + * + * Note, the only way is_guest_mode() can be true with 'last_vmentry_cpu == -1' + * is if userspace sets CPUID and feature MSRs (to enable VMX/SVM), then sets + * nested state, and then attempts to set CPUID and/or feature MSRs *again*. + */ +static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu) { - return vcpu->arch.last_vmentry_cpu != -1; + return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); } static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) |
