diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 70 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/smm.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 71 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 47 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 126 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 13 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 65 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 79 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 78 |
16 files changed, 383 insertions, 211 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5e4d4934c0d3..ecd85f4801cc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -21,7 +21,7 @@ #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -236,7 +236,7 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp struct kvm_cpuid_entry2 *entry; u32 base; - for_each_possible_hypervisor_cpuid_base(base) { + for_each_possible_cpuid_base_hypervisor(base) { entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { @@ -1427,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1444,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1763,7 +1761,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..f2b36d32ef40 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + /* * Checking root.hpa is sufficient even when KVM has mirror root. * We can have either: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 63bb77ee1bb1..8d1b632e33d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; + if (WARN_ON_ONCE(range->end <= range->start)) + return false; + + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); + + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + /* Unmap the old attribute page. */ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) range->attr_filter = KVM_FILTER_SHARED; @@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, return kvm_unmap_gfn_range(kvm, range); } -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} - -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; -} static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7cc0564f5f97..21a3b8166242 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -325,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 699e551ec93b..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 65fd245a9953..067f8e3f5a0d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -330,7 +331,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -796,12 +797,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +824,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +900,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +937,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +957,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +972,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..1aa0f07d3a63 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -26,6 +26,7 @@ #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> #include <asm/sev.h> #include "mmu.h" @@ -2933,6 +2934,7 @@ void __init sev_set_cpu_caps(void) void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -3059,6 +3061,15 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; + + if (!sev_enabled) + return; + + /* + * Do both SNP and SEV initialization at KVM module load. + */ + init_args.probe = true; + sev_platform_init(&init_args); } void sev_hardware_unsetup(void) @@ -3074,6 +3085,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -3119,7 +3132,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -3173,9 +3186,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3184,18 +3202,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3266,11 +3290,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d0c5c3300b..67fee545d42a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -31,6 +31,7 @@ #include <linux/string_choices.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -475,24 +476,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -566,7 +561,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } @@ -579,15 +574,15 @@ static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. */ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } @@ -607,9 +602,6 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); - - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -619,7 +611,7 @@ static int svm_enable_virtualization_cpu(void) uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -629,9 +621,9 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -652,13 +644,12 @@ static int svm_enable_virtualization_cpu(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -687,9 +678,6 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); - return 0; } @@ -1518,6 +1506,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1550,6 +1595,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -2149,14 +2199,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2167,17 +2216,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2231,6 +2274,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -5036,6 +5083,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -5061,6 +5110,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } @@ -5232,7 +5282,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..f16b068c4228 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -335,6 +335,8 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ccda95e53f62..ba736cbb0587 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -11,6 +11,13 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#ifdef CREATE_TRACE_POINTS +#define tracing_kvm_rip_read(vcpu) ({ \ + typeof(vcpu) __vcpu = vcpu; \ + __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \ + }) +#endif + /* * Tracepoint for guest mode entry. */ @@ -28,7 +35,7 @@ TRACE_EVENT(kvm_entry, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->rip = kvm_rip_read(vcpu); + __entry->rip = tracing_kvm_rip_read(vcpu); __entry->immediate_exit = force_immediate_exit; kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info, @@ -319,7 +326,7 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->guest_rip = tracing_kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ __entry->requests = READ_ONCE(vcpu->requests); \ @@ -423,7 +430,7 @@ TRACE_EVENT(kvm_page_fault, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->guest_rip = kvm_rip_read(vcpu); + __entry->guest_rip = tracing_kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5504d9e9fd32..d268224227f0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6,6 +6,7 @@ #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include <asm/msr.h> #include "x86.h" #include "cpuid.h" @@ -7202,8 +7203,8 @@ static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 77012b2eca0e..231a9633359c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -13,6 +13,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" @@ -279,9 +280,9 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, local_irq_disable(); if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { if (read) - rdmsrl(index, msr_info->data); + rdmsrq(index, msr_info->data); else - wrmsrl(index, msr_info->data); + wrmsrq(index, msr_info->data); __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); return true; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43..d70e5b90087d 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vmx(vcpu)->pi_desc); @@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler. + */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. + * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. + */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); list_add_tail(&vmx->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) @@ -274,6 +297,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -312,21 +336,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -334,11 +345,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + + enable_remapped_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -346,6 +358,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 9961e07cf071..df1d0cf76947 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -2,6 +2,7 @@ /* Copyright(c) 2021 Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/msr.h> #include <asm/sgx.h> #include "x86.h" @@ -411,16 +412,16 @@ void setup_default_sgx_lepubkeyhash(void) * MSRs exist but are read-only (locked and not writable). */ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || - rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; } else { /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c5766467a61..157c23db22be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -46,6 +46,7 @@ #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> @@ -273,6 +274,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: @@ -380,9 +382,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -393,7 +395,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -1063,7 +1065,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, * provide that period, so a CPU could write host's record into * guest's memory. */ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); @@ -1192,13 +1194,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1206,13 +1208,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1225,9 +1227,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); + wrmsrq(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } @@ -1248,7 +1250,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. */ if (vmx->pt_desc.host.ctl) - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1338,7 +1340,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -1362,7 +1364,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -1382,7 +1384,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; @@ -1394,7 +1396,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); if (vmx->guest_state_loaded) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; } @@ -1403,7 +1405,7 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); if (vmx->guest_state_loaded) - wrmsrl(MSR_KERNEL_GS_BASE, data); + wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } @@ -2574,7 +2576,7 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; - rdmsrl(msr, allowed); + rdmsrq(msr, allowed); return ctl_opt & allowed; } @@ -2746,7 +2748,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, break; } - rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) @@ -2766,7 +2768,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; - rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + rdmsrq(MSR_IA32_VMX_MISC, misc_msr); vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -2850,7 +2852,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; @@ -4391,7 +4393,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { @@ -6745,7 +6747,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -7052,7 +7054,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * the #NM exception. */ if (is_xfd_nm_fault(vcpu)) - rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) @@ -7307,7 +7309,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) - vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. @@ -7318,7 +7320,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) - native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } @@ -7358,10 +7360,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. + * + * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, + * and is affected by MMIO Stale Data. In such cases mitigation in only + * needed against an MMIO capable guest. */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mmio_stale_data_clear) && + else if (static_branch_unlikely(&cpu_buf_vm_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); @@ -7700,6 +7706,7 @@ int vmx_vm_init(struct kvm *kvm) case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -7959,7 +7966,7 @@ static __init u64 vmx_get_perf_capabilities(void) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); @@ -8508,7 +8515,7 @@ __init int vmx_hardware_setup(void) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c841817a914a..5bdb5b854924 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -578,7 +578,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(kvm_uret_msrs_list[slot], values->host); + wrmsrq(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -590,10 +590,10 @@ static int kvm_probe_user_return_msr(u32 msr) int ret; preempt_disable(); - ret = rdmsrl_safe(msr, &val); + ret = rdmsrq_safe(msr, &val); if (ret) goto out; - ret = wrmsrl_safe(msr, val); + ret = wrmsrq_safe(msr, val); out: preempt_enable(); return ret; @@ -630,7 +630,7 @@ static void kvm_user_return_msr_cpu_online(void) int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrl_safe(kvm_uret_msrs_list[i], &value); + rdmsrq_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } @@ -644,7 +644,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; @@ -1174,7 +1174,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); + wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); } if (cpu_feature_enabled(X86_FEATURE_PKU) && @@ -1205,7 +1205,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, kvm_host.xss); + wrmsrq(MSR_IA32_XSS, kvm_host.xss); } } @@ -1584,7 +1584,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1618,6 +1618,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -1660,7 +1662,7 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, *data = MSR_PLATFORM_INFO_CPUID_FAULT; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(index, data); + rdmsrq_safe(index, data); break; default: return kvm_x86_call(get_feature_msr)(index, data); @@ -3827,7 +3829,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, data); + wrmsrq(MSR_IA32_PRED_CMD, data); break; } case MSR_IA32_FLUSH_CMD: @@ -3840,7 +3842,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); break; case MSR_EFER: return set_efer(vcpu, msr_info); @@ -4597,7 +4599,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } -static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) { return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; } @@ -9736,7 +9738,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * with an exception. PAT[0] is set to WB on RESET and also by the * kernel, i.e. failure indicates a kernel bug or broken firmware. */ - if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("host PAT[0] is not WB\n"); return -EIO; @@ -9770,15 +9772,15 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } - rdmsrl_safe(MSR_EFER, &kvm_host.efer); + rdmsrq_safe(MSR_EFER, &kvm_host.efer); if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, kvm_host.xss); + rdmsrq(MSR_IA32_XSS, kvm_host.xss); kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); r = ops->hardware_setup(); if (r != 0) @@ -10974,7 +10976,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) switch_fpu_return(); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -11060,7 +11062,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_call(handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* * Consume any pending interrupts, including the possible source of @@ -11098,7 +11100,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -11492,7 +11495,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; - u32 sync_valid_fields; + u64 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); @@ -11786,6 +11789,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11799,6 +11804,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); @@ -13552,25 +13559,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13580,9 +13589,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13590,12 +13599,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } @@ -13608,7 +13623,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); @@ -13652,12 +13668,12 @@ int kvm_spec_ctrl_test_value(u64 value) local_irq_save(flags); - if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; - else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) ret = 1; else - wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); local_irq_restore(flags); |