diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 61 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/reverse_cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 60 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 72 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm/vmenter.S | 15 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 26 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 28 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 182 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 31 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 77 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/xen.c | 15 |
19 files changed, 423 insertions, 207 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 43b1df7ec183..1bb5e8f6c63e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -758,6 +758,12 @@ void kvm_set_cpu_caps(void) if (cpu_feature_enabled(X86_FEATURE_SRSO_NO)) kvm_cpu_cap_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_mask(CPUID_8000_0021_EAX, F(VERW_CLEAR)); + + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0021_ECX, + F(TSA_SQ_NO) | F(TSA_L1_NO) + ); + /* * Hide RDTSCP and RDPID if either feature is reported as supported but * probing MSR_TSC_AUX failed. This is purely a sanity check and @@ -1243,7 +1249,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; case 0x80000021: - entry->ebx = entry->ecx = entry->edx = 0; + entry->ebx = entry->edx = 0; /* * Pass down these bits: * EAX 0 NNDBP, Processor ignores nested data breakpoints @@ -1259,6 +1265,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax |= BIT(2); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) entry->eax |= BIT(6); + cpuid_entry_override(entry, CPUID_8000_0021_ECX); break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 28555bbd52e8..cb0a531e13c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1406,8 +1406,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; @@ -1528,8 +1527,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } @@ -1581,7 +1579,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } @@ -1646,7 +1644,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = APIC_BUS_FREQUENCY; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7f57dce5c828..9aae76b74417 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -587,7 +587,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(apic->apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vec); + static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -632,7 +632,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(apic->apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -640,6 +640,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) + return; + + static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { /* This may race with setting of irr in __apic_accept_irq() and @@ -2315,11 +2326,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) data &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg64(apic, APIC_ICR, data); + if (kvm_x86_ops.x2apic_icr_is_split) { + kvm_lapic_set_reg(apic, APIC_ICR, data); + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, data); + } trace_kvm_apic_write(APIC_ICR, data); return 0; } +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) +{ + if (kvm_x86_ops.x2apic_icr_is_split) + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; + + return kvm_lapic_get_reg64(apic, APIC_ICR); +} + /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { @@ -2337,7 +2362,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) - WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR))); + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } @@ -2540,7 +2565,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(-1); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2760,18 +2785,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, /* * In x2APIC mode, the LDR is fixed and based on the id. And - * ICR is internally a single 64-bit register, but needs to be - * split to ICR+ICR2 in userspace for backwards compatibility. + * if the ICR is _not_ split, ICR is internally a single 64-bit + * register, but needs to be split to ICR+ICR2 in userspace for + * backwards compatibility. */ - if (set) { + if (set) *ldr = kvm_apic_calc_x2apic_ldr(*id); - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); - } else { - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + if (!kvm_x86_ops.x2apic_icr_is_split) { + if (set) { + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } } @@ -2829,7 +2858,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -2971,7 +3000,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) u32 low; if (reg == APIC_ICR) { - *data = kvm_lapic_get_reg64(apic, APIC_ICR); + *data = kvm_x2apic_icr_read(apic); return 0; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a5ac4a5a5179..e5d2dc58fcf8 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -122,6 +122,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); +void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index e43909d6504a..7fbd24fbf363 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -14,6 +14,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_2_EDX, + CPUID_8000_0021_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -45,6 +46,10 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* CPUID level 0x80000021 (ECX) */ +#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1) +#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2) + struct cpuid_reg { u32 function; u32 index; @@ -71,6 +76,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, + [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, }; /* @@ -107,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(SGX2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); + KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO); + KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO); default: return x86_feature; } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index fb125b54ee68..0adbf0677b7c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -839,7 +839,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -915,6 +915,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || @@ -952,6 +953,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -970,33 +973,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -1012,6 +988,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d8e192ad5953..0e0bc2c46b51 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1755,6 +1755,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) struct kvm_vcpu *src_vcpu; unsigned long i; + if (src->created_vcpus != atomic_read(&src->online_vcpus) || + dst->created_vcpus != atomic_read(&dst->online_vcpus)) + return -EBUSY; + if (!sev_es_guest(src)) return 0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a96facc05139..5a6bd9d5cceb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1426,7 +1426,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb) { int i; - for_each_online_cpu(i) + for_each_possible_cpu(i) cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); } @@ -1920,11 +1920,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -3035,10 +3035,21 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_DEBUGCTLMSR: if (!lbrv) { - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * AMD changed the architectural behavior of bits 5:2. On CPUs + * without BusLockTrap, bits 5:2 control "external pins", but + * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap + * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed + * the guest to set bits 5:2 despite not actually virtualizing + * Performance-Monitoring/Breakpoint external pins. Drop bits + * 5:2 for backwards compatibility. + */ + data &= ~GENMASK(5, 2); + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3065,7 +3076,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; @@ -3953,6 +3964,9 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + /* * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM * can't read guest memory (dereference memslots) to decode the WRMSR. @@ -3970,6 +3984,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -3977,15 +4003,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_svm *svm = to_svm(vcpu); bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4004,9 +4033,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * is enough to force an immediate vmexit. */ disable_nmi_singlestep(svm); - smp_send_reschedule(vcpu->cpu); + force_immediate_exit = true; } + if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); + pre_svm_run(vcpu); sync_lapic_to_cr8(vcpu); @@ -4020,13 +4052,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_hv_update_vp_id(svm->vmcb, vcpu); /* - * Run with all-zero DR6 unless needed, so that we can get the exact cause - * of a #DB. + * Run with all-zero DR6 unless the guest can write DR6 freely, so that + * KVM can get the exact cause of a #DB. Note, loading guest DR6 from + * KVM's snapshot is only necessary when DR accesses won't exit. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) + svm_set_dr6(vcpu, vcpu->arch.dr6); + else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -4103,9 +4136,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - return svm_exit_handlers_fastpath(vcpu); } @@ -4826,6 +4856,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + + .x2apic_icr_is_split = true, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, @@ -4847,8 +4879,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .request_immediate_exit = __kvm_request_immediate_exit, - .sched_in = svm_sched_in, .nested_ops = &svm_nested_ops, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4cb1425900c6..a7f2faea8858 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -539,7 +539,7 @@ static inline bool is_x2apic_msrpm_offset(u32 offset) /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) extern bool dump_invalid_vmcb; diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 5be9a63f09ff..48b72625cc45 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -166,13 +166,12 @@ SYM_FUNC_START(__svm_vcpu_run) #endif mov VCPU_RDI(%_ASM_DI), %_ASM_DI - /* Enter guest mode */ - sti + /* Clobbers EFLAGS.ZF */ + VM_CLEAR_CPU_BUFFERS + /* Enter guest mode */ 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -336,12 +335,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX - /* Enter guest mode */ - sti + /* Clobbers EFLAGS.ZF */ + VM_CLEAR_CPU_BUFFERS + /* Enter guest mode */ 1: vmrun %_ASM_AX - -2: cli +2: /* Pop @svm to RDI, guest registers have been saved already. */ pop %_ASM_DI diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6c1dcf44c4fa..ab407bc00d84 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,20 +15,23 @@ * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), + TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit), + TP_ARGS(vcpu, force_immediate_exit), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned long, rip ) + __field( bool, immediate_exit ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; __entry->rip = kvm_rip_read(vcpu); + __entry->immediate_exit = force_immediate_exit; ), - TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) + TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, + __entry->immediate_exit ? "[immediate exit]" : "") ); /* diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8052f8b7d8e1..2c3cf4351c4c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2532,10 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & + vmx_get_supported_debugctl(vcpu, false)); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); + vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) @@ -3022,7 +3023,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && - CC(!kvm_dr7_valid(vmcs12->guest_dr7))) + (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || + CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && @@ -3402,7 +3404,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) @@ -4374,6 +4376,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + /* + * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. + * Writes to DEBUGCTL that aren't intercepted by L1 are immediately + * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into + * vmcs02 doesn't strictly track vmcs12. + */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); @@ -4564,7 +4572,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmx_guest_debugctl_write(vcpu, 0); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) @@ -4619,6 +4627,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); } + /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ + vmx_reload_guest_debugctl(vcpu); + /* * Note that calling vmx_set_{efer,cr0,cr4} is important as they * handle a variety of side effects to KVM's software model. @@ -4839,6 +4850,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } + if (vmx->nested.update_vmcs01_hwapic_isr) { + vmx->nested.update_vmcs01_hwapic_isr = false; + kvm_apic_update_hwapic_isr(vcpu); + } + if ((vm_exit_reason != -1) && (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) vmx->nested.need_vmcs12_to_shadow_sync = true; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 220cdbe1e286..76d3ed8abf6a 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -672,11 +672,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) */ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) { - u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); + u64 data = vmx_guest_debugctl_read(); if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { data &= ~DEBUGCTLMSR_LBR; - vmcs_write64(GUEST_IA32_DEBUGCTL, data); + vmx_guest_debugctl_write(vcpu, data); } } @@ -746,7 +746,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) if (!lbr_desc->event) { vmx_disable_lbr_msrs_passthrough(vcpu); - if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) + if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR) goto warn; if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) goto warn; @@ -769,7 +769,7 @@ warn: static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) { - if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) + if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)) intel_pmu_release_guest_lbr_event(vcpu); } diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1b56c5e5c9fb..0d04382f37af 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -272,6 +272,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -310,21 +311,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -332,11 +320,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + enable_remapped_mode = false; + + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -344,6 +333,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 460ba48eb66c..22e4c9bbbcb4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -49,6 +49,8 @@ #include <asm/virtext.h> #include <asm/vmx.h> +#include <trace/events/ipi.h> + #include "capabilities.h" #include "cpuid.h" #include "evmcs.h" @@ -705,14 +707,19 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } -static void crash_vmclear_local_loaded_vmcss(void) +static void vmx_emergency_disable(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) + loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); + if (v->shadow_vmcs) + vmcs_clear(v->shadow_vmcs); + } + + __cpu_emergency_vmxoff(); } static void __loaded_vmcs_clear(void *arg) @@ -1223,8 +1230,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - vmx->req_immediate_exit = false; - /* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions @@ -1418,13 +1423,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, */ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -2031,7 +2032,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + msr_info->data = vmx_guest_debugctl_read(); break; default: find_uret_msr: @@ -2056,7 +2057,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } -static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) +u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) { u64 debugctl = 0; @@ -2068,9 +2069,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (boot_cpu_has(X86_FEATURE_RTM) && + (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM))) + debugctl |= DEBUGCTLMSR_RTM_DEBUG; + return debugctl; } +bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) +{ + u64 invalid; + + invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); + if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); + } + return !invalid; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -2139,31 +2156,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_DEBUGCTLMSR: { - u64 invalid; - - invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); - if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); - invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); - } - - if (invalid) + case MSR_IA32_DEBUGCTLMSR: + if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) return 1; + data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; - vmcs_write64(GUEST_IA32_DEBUGCTL, data); + vmx_guest_debugctl_write(vcpu, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); return 0; - } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -4749,7 +4757,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write32(GUEST_SYSENTER_CS, 0); vmcs_writel(GUEST_SYSENTER_ESP, 0); vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + vmx_guest_debugctl_write(&vmx->vcpu, 0); if (cpu_has_vmx_tpr_shadow()) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); @@ -5929,22 +5938,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->req_immediate_exit && - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { - kvm_lapic_expired_hv_timer(vcpu); + /* + * In the *extremely* unlikely scenario that this is a spurious VM-Exit + * due to the timer expiring while it was "soft" disabled, just eat the + * exit and re-enter the guest. + */ + if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) return EXIT_FASTPATH_REENTER_GUEST; - } - return EXIT_FASTPATH_NONE; + /* + * If the timer expired because KVM used it to force an immediate exit, + * then mission accomplished. + */ + if (force_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + + /* + * If L2 is active, go down the slow path as emulating the guest timer + * expiration likely requires synthesizing a nested VM-Exit. + */ + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; } static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - handle_fastpath_preemption_timer(vcpu); + /* + * This non-fastpath handler is reached if and only if the preemption + * timer was being used to emulate a guest timer while L2 is active. + * All other scenarios are supposed to be handled in the fastpath. + */ + WARN_ON_ONCE(!is_guest_mode(vcpu)); + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -6702,11 +6735,27 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(int max_isr) +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; + /* + * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI + * is only relevant for if and only if Virtual Interrupt Delivery is + * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's + * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested + * VM-Exit, otherwise L1 with run with a stale SVI. + */ + if (is_guest_mode(vcpu)) { + /* + * KVM is supposed to forward intercepted L2 EOIs to L1 if VID + * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. + */ + to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; + return; + } + if (max_isr == -1) max_isr = 0; @@ -7051,13 +7100,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->req_immediate_exit) { + if (force_immediate_exit) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (vmx->hv_deadline_tsc != -1) { @@ -7110,13 +7159,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, barrier_nospec(); } -static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { + /* + * If L2 is active, some VMX preemption timer exits can be handled in + * the fastpath even, all other exits must use the slow path. + */ + if (is_guest_mode(vcpu) && + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + return EXIT_FASTPATH_NONE; + switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: - return handle_fastpath_preemption_timer(vcpu); + return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); default: return EXIT_FASTPATH_NONE; } @@ -7138,7 +7196,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) - mds_clear_cpu_buffers(); + x86_clear_cpu_buffers(); vmx_disable_fb_clear(vmx); @@ -7155,8 +7213,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7182,7 +7241,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -7201,6 +7260,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) + set_debugreg(vcpu->arch.dr6, 6); + + if (run_flags & KVM_RUN_LOAD_DEBUGCTL) + vmx_reload_guest_debugctl(vcpu); + /* * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time @@ -7220,10 +7285,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7241,7 +7302,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) - vmx_update_hv_timer(vcpu); + vmx_update_hv_timer(vcpu, force_immediate_exit); + else if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); kvm_wait_lapic_expire(vcpu); @@ -7257,8 +7320,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* @@ -7315,10 +7378,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - - return vmx_exit_handlers_fastpath(vcpu); + return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } static void vmx_vcpu_free(struct kvm_vcpu *vcpu) @@ -7825,11 +7885,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->req_immediate_exit = true; -} - static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, struct x86_instruction_info *info) { @@ -8150,6 +8205,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, + .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM, + .update_exception_bitmap = vmx_update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, @@ -8199,6 +8256,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .enable_nmi_window = vmx_enable_nmi_window, .enable_irq_window = vmx_enable_irq_window, .update_cr8_intercept = vmx_update_cr8_intercept, + + .x2apic_icr_is_split = false, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, @@ -8232,8 +8291,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .request_immediate_exit = vmx_request_immediate_exit, - .sched_in = vmx_sched_in, .cpu_dirty_log_size = PML_ENTITY_NUM, @@ -8490,7 +8547,6 @@ static __init int hardware_setup(void) if (!enable_preemption_timer) { vmx_x86_ops.set_hv_timer = NULL; vmx_x86_ops.cancel_hv_timer = NULL; - vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; @@ -8551,8 +8607,7 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); + cpu_emergency_unregister_virt_callback(vmx_emergency_disable); vmx_cleanup_l1d_flush(); } @@ -8626,8 +8681,7 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); + cpu_emergency_register_virt_callback(vmx_emergency_disable); vmx_check_vmcs12_offsets(); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9e0bb98b116d..dc6f06326648 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -189,6 +189,7 @@ struct nested_vmx { bool reload_vmcs01_apic_access_page; bool update_vmcs01_cpu_dirty_logging; bool update_vmcs01_apicv_status; + bool update_vmcs01_hwapic_isr; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -342,8 +343,6 @@ struct vcpu_vmx { unsigned int ple_window; bool ple_window_dirty; - bool req_immediate_exit; - /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -351,8 +350,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included @@ -445,6 +442,32 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); +bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); + +static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) +{ + WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM); + + val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM; + vmcs_write64(GUEST_IA32_DEBUGCTL, val); +} + +static inline u64 vmx_guest_debugctl_read(void) +{ + return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM; +} + +static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) +{ + u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); + + if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM)) + return; + + vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM); +} + /* * Note, early Intel manuals have the write-low and read-high bitmap offsets * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19ce8199f347..f542ab5e8698 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1614,7 +1614,7 @@ static unsigned int num_msr_based_features; ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1653,6 +1653,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -3571,7 +3573,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu) int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; @@ -3623,15 +3624,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data == BIT_ULL(18)) { vcpu->arch.msr_hwcr = data; } else if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; @@ -3811,16 +3810,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; - fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (pr || data != 0) - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* @@ -3847,9 +3843,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -10584,12 +10578,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - smp_send_reschedule(vcpu->cpu); -} -EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); - /* * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without @@ -10603,6 +10591,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); fastpath_t exit_fastpath; + u64 run_flags, debug_ctl; bool req_immediate_exit = false; @@ -10823,9 +10812,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } + run_flags = 0; if (req_immediate_exit) { + run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_request_immediate_exit)(vcpu); } fpregs_assert_state_consistent(); @@ -10841,10 +10831,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + run_flags |= KVM_RUN_LOAD_GUEST_DR6; } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } + /* + * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL + * can be modified in IRQ context, e.g. via SMP function calls. Inform + * vendor code if any host-owned bits were changed, e.g. so that the + * value loaded into hardware while running the guest can be updated. + */ + debug_ctl = get_debugctlmsr(); + if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && + !vcpu->arch.guest_state_protected) + run_flags |= KVM_RUN_LOAD_DEBUGCTL; + vcpu->arch.host_debugctl = debug_ctl; + guest_timing_enter_irqoff(); for (;;) { @@ -10857,7 +10862,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10869,6 +10874,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) break; } + run_flags = 0; + /* Note, VM-Exits that go down the "slow" path are accounted below. */ ++vcpu->stat.exits; } @@ -11460,6 +11467,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11473,6 +11482,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); @@ -13376,16 +13387,22 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); - if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13395,9 +13412,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13405,11 +13422,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } @@ -13422,7 +13446,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de72586f406..f3554bf05201 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -331,6 +331,18 @@ extern bool report_ignored_msrs; extern bool eager_page_split; +static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); +} + +static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); +} + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 684a39df60d9..8e38f976feb4 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1536,8 +1536,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, { struct kvm_vcpu *vcpu; - if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) - return -EINVAL; + /* + * Don't check for the port being within range of max_evtchn_port(). + * Userspace can configure what ever targets it likes; events just won't + * be delivered if/while the target is invalid, just like userspace can + * configure MSIs which target non-existent APICs. + * + * This allow on Live Migration and Live Update, the IRQ routing table + * can be restored *independently* of other things like creating vCPUs, + * without imposing an ordering dependency on userspace. In this + * particular case, the problematic ordering would be with setting the + * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096 + * instead of 1024 event channels. + */ /* We only support 2 level event channels for now */ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) |