diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-11 22:10:08 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-11 22:10:08 +0300 |
commit | e18a90427c4ef26e9208a8710b7d10eaf02bed48 (patch) | |
tree | 44d9c2585e9ae6a4c85c42517cd7d9904a130d34 | |
parent | 2ae08b36c06ea8df73a79f6b80ff7964e006e9e3 (diff) | |
parent | 19a7cc817a380f7a412d7d76e145e9e2bc47e52f (diff) | |
download | linux-e18a90427c4ef26e9208a8710b7d10eaf02bed48.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more kvm updates from Paolo Bonzini:
- Xen timer fixes
- Documentation formatting fixes
- Make rseq selftest compatible with glibc-2.35
- Fix handling of illegal LEA reg, reg
- Cleanup creation of debugfs entries
- Fix steal time cache handling bug
- Fixes for MMIO caching
- Optimize computation of number of LBRs
- Fix uninitialized field in guest_maxphyaddr < host_maxphyaddr path
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (26 commits)
KVM: x86/MMU: properly format KVM_CAP_VM_DISABLE_NX_HUGE_PAGES capability table
Documentation: KVM: extend KVM_CAP_VM_DISABLE_NX_HUGE_PAGES heading underline
KVM: VMX: Adjust number of LBR records for PERF_CAPABILITIES at refresh
KVM: VMX: Use proper type-safe functions for vCPU => LBRs helpers
KVM: x86: Refresh PMU after writes to MSR_IA32_PERF_CAPABILITIES
KVM: selftests: Test all possible "invalid" PERF_CAPABILITIES.LBR_FMT vals
KVM: selftests: Use getcpu() instead of sched_getcpu() in rseq_test
KVM: selftests: Make rseq compatible with glibc-2.35
KVM: Actually create debugfs in kvm_create_vm()
KVM: Pass the name of the VM fd to kvm_create_vm_debugfs()
KVM: Get an fd before creating the VM
KVM: Shove vcpu stats_id init into kvm_vcpu_init()
KVM: Shove vm stats_id init into kvm_create_vm()
KVM: x86/mmu: Add sanity check that MMIO SPTE mask doesn't overlap gen
KVM: x86/mmu: rename trace function name for asynchronous page fault
KVM: x86/xen: Stop Xen timer before changing IRQ
KVM: x86/xen: Initialize Xen timer only once
KVM: SVM: Disable SEV-ES support if MMIO caching is disable
KVM: x86/mmu: Fully re-evaluate MMIO caching when SPTE masks change
KVM: x86: Tag kvm_mmu_x86_module_init() with __init
...
-rw-r--r-- | Documentation/virt/kvm/api.rst | 10 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 28 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 29 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 13 | ||||
-rw-r--r-- | arch/x86/kvm/xen.c | 31 | ||||
-rw-r--r-- | include/trace/events/kvm.h | 2 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/Makefile | 7 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/rseq_test.c | 58 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c | 17 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 81 |
19 files changed, 221 insertions, 129 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9788b19f9ff7..abd7c32126ce 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8262,15 +8262,15 @@ dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is available and supports the `KVM_PV_DUMP_CPU` subcommand. 8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES ---------------------------- +------------------------------------- -:Capability KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +:Capability: KVM_CAP_VM_DISABLE_NX_HUGE_PAGES :Architectures: x86 :Type: vm :Parameters: arg[0] must be 0. -:Returns 0 on success, -EPERM if the userspace process does not - have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been - created. +:Returns: 0 on success, -EPERM if the userspace process does not + have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been + created. This capability disables the NX huge pages mitigation for iTLB MULTIHIT. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8281d64a431..5ffa578cafe1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1704,7 +1704,7 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) -void kvm_mmu_x86_module_init(void); +void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 047c583596bb..b4eeb7c75dfa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4578,6 +4578,10 @@ static const struct mode_dual mode_dual_63 = { N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) }; +static const struct instr_dual instr_dual_8d = { + D(DstReg | SrcMem | ModRM | NoAccess), N +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -4634,7 +4638,7 @@ static const struct opcode opcode_table[256] = { I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg), - D(ModRM | SrcMem | NoAccess | DstReg), + ID(0, &instr_dual_8d), I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), G(0, group1A), /* 0x90 - 0x97 */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e2ce3556915e..9dda989a1cf0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2284,10 +2284,12 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) struct kvm_lapic *apic = vcpu->arch.apic; u64 val; - if (apic_x2apic_mode(apic)) - kvm_lapic_msr_read(apic, offset, &val); - else + if (apic_x2apic_mode(apic)) { + if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm)) + return; + } else { val = kvm_lapic_get_reg(apic, offset); + } /* * ICR is a single 64-bit register when x2APIC is enabled. For legacy diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index a99acec925eb..6bdaacb6faa0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -6,6 +6,8 @@ #include "kvm_cache_regs.h" #include "cpuid.h" +extern bool __read_mostly enable_mmio_caching; + #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 06ac8c7cef67..eccddb136954 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4164,7 +4164,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { - trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); + trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { @@ -6697,11 +6697,15 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) /* * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as * its default value of -1 is technically undefined behavior for a boolean. + * Forward the module init call to SPTE code so that it too can handle module + * params that need to be resolved/snapshot. */ -void kvm_mmu_x86_module_init(void) +void __init kvm_mmu_x86_module_init(void) { if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); + + kvm_mmu_spte_module_init(); } /* diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 7314d27d57a4..2e08b2a45361 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -20,7 +20,9 @@ #include <asm/vmx.h> bool __read_mostly enable_mmio_caching = true; +static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); +EXPORT_SYMBOL_GPL(enable_mmio_caching); u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; @@ -43,6 +45,18 @@ u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; u8 __read_mostly shadow_phys_bits; +void __init kvm_mmu_spte_module_init(void) +{ + /* + * Snapshot userspace's desire to allow MMIO caching. Whether or not + * KVM can actually enable MMIO caching depends on vendor-specific + * hardware capabilities and other module params that can't be resolved + * until the vendor module is loaded, i.e. enable_mmio_caching can and + * will change when the vendor module is (re)loaded. + */ + allow_mmio_caching = enable_mmio_caching; +} + static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; @@ -340,10 +354,24 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) BUG_ON((u64)(unsigned)access_mask != access_mask); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + /* + * Reset to the original module param value to honor userspace's desire + * to (dis)allow MMIO caching. Update the param itself so that + * userspace can see whether or not KVM is actually using MMIO caching. + */ + enable_mmio_caching = allow_mmio_caching; if (!enable_mmio_caching) mmio_value = 0; /* + * The mask must contain only bits that are carved out specifically for + * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO + * generation. + */ + if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK)) + mmio_value = 0; + + /* * Disable MMIO caching if the MMIO value collides with the bits that * are used to hold the relocated GFN when the L1TF mitigation is * enabled. This should never fire as there is no known hardware that diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index cabe3fbb4f39..f3744eea45f5 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,8 +5,6 @@ #include "mmu_internal.h" -extern bool __read_mostly enable_mmio_caching; - /* * A MMU present SPTE is backed by actual memory and may or may not be present * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it @@ -125,6 +123,20 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); static_assert(!(SPTE_MMU_PRESENT_MASK & (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); +/* + * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the + * MMU-present bit. The generation obviously co-exists with the magic MMIO + * mask/value, and MMIO SPTEs are considered !MMU-present. + * + * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT + * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO + * and so they're off-limits for generation; additional checks ensure the mask + * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). + */ +#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) +static_assert(!(SPTE_MMIO_ALLOWED_MASK & + (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); + #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) @@ -450,6 +462,7 @@ static inline u64 restore_acc_track_spte(u64 spte) u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); +void __init kvm_mmu_spte_module_init(void); void kvm_mmu_reset_all_pte_masks(void); #endif diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b0e793e7d85c..28064060413a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -22,6 +22,7 @@ #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include "mmu.h" #include "x86.h" #include "svm.h" #include "svm_ops.h" @@ -2221,6 +2222,15 @@ void __init sev_hardware_setup(void) if (!sev_es_enabled) goto out; + /* + * SEV-ES requires MMIO caching as KVM doesn't have access to the guest + * instruction stream, i.e. can't emulate in response to a #NPF and + * instead relies on #NPF(RSVD) being reflected into the guest as #VC + * (the guest can then do a #VMGEXIT to request MMIO emulation). + */ + if (!enable_mmio_caching) + goto out; + /* Does the CPU support SEV-ES? */ if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 38f873cb6f2c..f3813dbacb9f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5034,13 +5034,16 @@ static __init int svm_hardware_setup(void) /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); - /* Note, SEV setup consumes npt_enabled. */ + svm_adjust_mmio_mask(); + + /* + * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which + * may be modified by svm_adjust_mmio_mask()). + */ sev_hardware_setup(); svm_hv_hardware_setup(); - svm_adjust_mmio_mask(); - for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 862c1a4d971b..c399637a3a79 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -171,13 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) -{ - struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); - - return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT); -} - static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) { struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); @@ -592,7 +585,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - if (cpuid_model_is_consistent(vcpu)) + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (cpuid_model_is_consistent(vcpu) && + (perf_capabilities & PMU_CAP_LBR_FMT)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; @@ -600,7 +595,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); - perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_mask = counter_mask; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index fb8e3480a9d7..24d58c2ffaa3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -6,6 +6,7 @@ #include <asm/kvm.h> #include <asm/intel_pt.h> +#include <asm/perf_event.h> #include "capabilities.h" #include "../kvm_cache_regs.h" @@ -104,15 +105,6 @@ static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) return pmu->version > 1; } -#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) -#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) - -void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); -bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); - -int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); -void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); - struct lbr_desc { /* Basic info about guest LBR records. */ struct x86_pmu_lbr records; @@ -542,6 +534,25 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) +{ + return &to_vmx(vcpu)->lbr_desc; +} + +static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) +{ + return &vcpu_to_lbr_desc(vcpu)->records; +} + +static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + return !!vcpu_to_lbr_records(vcpu)->nr; +} + +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); + static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 79a8a74b6b2a..205ebdc2b11b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3413,6 +3413,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; struct kvm_steal_time __user *st; struct kvm_memslots *slots; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; u64 steal; u32 version; @@ -3430,13 +3431,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) slots = kvm_memslots(vcpu->kvm); if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { - gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; - /* We rely on the fact that it fits in a single page. */ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) || + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || kvm_is_error_hva(ghc->hva) || !ghc->memslot) return; } @@ -3545,9 +3545,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.perf_capabilities = data; - + kvm_pmu_refresh(vcpu); return 0; - } + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -4714,6 +4714,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) struct kvm_steal_time __user *st; struct kvm_memslots *slots; static const u8 preempted = KVM_VCPU_PREEMPTED; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; /* * The vCPU can be marked preempted if and only if the VM-Exit was on @@ -4741,6 +4742,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) slots = kvm_memslots(vcpu->kvm); if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || kvm_is_error_hva(ghc->hva) || !ghc->memslot)) return; @@ -13019,6 +13021,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c fault.error_code = error_code; fault.nested_page_fault = false; fault.address = gva; + fault.async_page_fault = false; } vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index a0c05ccbf4b1..280cb5dc7341 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -707,23 +707,24 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_TIMER: - if (data->u.timer.port) { - if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { - r = -EINVAL; - break; - } - vcpu->arch.xen.timer_virq = data->u.timer.port; + if (data->u.timer.port && + data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { + r = -EINVAL; + break; + } + + if (!vcpu->arch.xen.timer.function) kvm_xen_init_timer(vcpu); - /* Restart the timer if it's set */ - if (data->u.timer.expires_ns) - kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, - data->u.timer.expires_ns - - get_kvmclock_ns(vcpu->kvm)); - } else if (kvm_xen_timer_enabled(vcpu)) { - kvm_xen_stop_timer(vcpu); - vcpu->arch.xen.timer_virq = 0; - } + /* Stop the timer (if it's running) before changing the vector */ + kvm_xen_stop_timer(vcpu); + vcpu->arch.xen.timer_virq = data->u.timer.port; + + /* Start the timer if the new value has a valid vector+expiry. */ + if (data->u.timer.port && data->u.timer.expires_ns) + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, + data->u.timer.expires_ns - + get_kvmclock_ns(vcpu->kvm)); r = 0; break; diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 37e1e1a2d67d..3bd31ea23fee 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -282,7 +282,7 @@ DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, TP_ARGS(gva, gfn) ); -DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault, +DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_repeated_fault, TP_PROTO(u64 gva, u64 gfn), diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c7f47429d6cd..4c122f1b1737 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -4,6 +4,8 @@ include ../../../build/Build.include all: top_srcdir = ../../../.. +include $(top_srcdir)/scripts/subarch.include +ARCH ?= $(SUBARCH) # For cross-builds to work, UNAME_M has to map to ARCH and arch specific # directories and targets in this Makefile. "uname -m" doesn't map to @@ -197,7 +199,8 @@ endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ - -I$(<D) -Iinclude/$(UNAME_M) -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES) + -I$(<D) -Iinclude/$(UNAME_M) -I ../rseq -I.. $(EXTRA_CFLAGS) \ + $(KHDR_INCLUDES) no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie) @@ -206,7 +209,7 @@ no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ pgste-option = $(call try-run, echo 'int main() { return 0; }' | \ $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste) - +LDLIBS += -ldl LDFLAGS += -pthread $(no-pie-option) $(pgste-option) # After inclusion, $(OUTPUT) is defined and diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index a54d4d05a058..fac248a43666 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -20,15 +20,7 @@ #include "processor.h" #include "test_util.h" -static __thread volatile struct rseq __rseq = { - .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, -}; - -/* - * Use an arbitrary, bogus signature for configuring rseq, this test does not - * actually enter an rseq critical section. - */ -#define RSEQ_SIG 0xdeadbeef +#include "../rseq/rseq.c" /* * Any bug related to task migration is likely to be timing-dependent; perform @@ -49,12 +41,16 @@ static void guest_code(void) GUEST_SYNC(0); } -static void sys_rseq(int flags) +/* + * We have to perform direct system call for getcpu() because it's + * not available until glic 2.29. + */ +static void sys_getcpu(unsigned *cpu) { int r; - r = syscall(__NR_rseq, &__rseq, sizeof(__rseq), flags, RSEQ_SIG); - TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno)); + r = syscall(__NR_getcpu, cpu, NULL, NULL); + TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)", errno, strerror(errno)); } static int next_cpu(int cpu) @@ -101,7 +97,7 @@ static void *migration_worker(void *__rseq_tid) atomic_inc(&seq_cnt); /* - * Ensure the odd count is visible while sched_getcpu() isn't + * Ensure the odd count is visible while getcpu() isn't * stable, i.e. while changing affinity is in-progress. */ smp_wmb(); @@ -142,10 +138,10 @@ static void *migration_worker(void *__rseq_tid) * check completes. * * 3. To ensure the read-side makes efficient forward progress, - * e.g. if sched_getcpu() involves a syscall. Stalling the - * read-side means the test will spend more time waiting for - * sched_getcpu() to stabilize and less time trying to hit - * the timing-dependent bug. + * e.g. if getcpu() involves a syscall. Stalling the read-side + * means the test will spend more time waiting for getcpu() + * to stabilize and less time trying to hit the timing-dependent + * bug. * * Because any bug in this area is likely to be timing-dependent, * run with a range of delays at 1us intervals from 1us to 10us @@ -218,7 +214,9 @@ int main(int argc, char *argv[]) calc_min_max_cpu(); - sys_rseq(0); + r = rseq_register_current_thread(); + TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)", + errno, strerror(errno)); /* * Create and run a dummy VM that immediately exits to userspace via @@ -238,9 +236,9 @@ int main(int argc, char *argv[]) /* * Verify rseq's CPU matches sched's CPU. Ensure migration - * doesn't occur between sched_getcpu() and reading the rseq - * cpu_id by rereading both if the sequence count changes, or - * if the count is odd (migration in-progress). + * doesn't occur between getcpu() and reading the rseq cpu_id + * by rereading both if the sequence count changes, or if the + * count is odd (migration in-progress). */ do { /* @@ -250,13 +248,13 @@ int main(int argc, char *argv[]) snapshot = atomic_read(&seq_cnt) & ~1; /* - * Ensure reading sched_getcpu() and rseq.cpu_id - * complete in a single "no migration" window, i.e. are - * not reordered across the seq_cnt reads. + * Ensure calling getcpu() and reading rseq.cpu_id complete + * in a single "no migration" window, i.e. are not reordered + * across the seq_cnt reads. */ smp_rmb(); - cpu = sched_getcpu(); - rseq_cpu = READ_ONCE(__rseq.cpu_id); + sys_getcpu(&cpu); + rseq_cpu = rseq_current_cpu_raw(); smp_rmb(); } while (snapshot != atomic_read(&seq_cnt)); @@ -267,9 +265,9 @@ int main(int argc, char *argv[]) /* * Sanity check that the test was able to enter the guest a reasonable * number of times, e.g. didn't get stalled too often/long waiting for - * sched_getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a - * fairly conservative ratio on x86-64, which can do _more_ KVM_RUNs - * than migrations given the 1us+ delay in the migration task. + * getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a fairly + * conservative ratio on x86-64, which can do _more_ KVM_RUNs than + * migrations given the 1us+ delay in the migration task. */ TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), "Only performed %d KVM_RUNs, task stalled too much?\n", i); @@ -278,7 +276,7 @@ int main(int argc, char *argv[]) kvm_vm_free(vm); - sys_rseq(RSEQ_FLAG_UNREGISTER); + rseq_unregister_current_thread(); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 6ec901dab61e..069589c52f41 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -59,6 +59,7 @@ int main(int argc, char *argv[]) int ret; union cpuid10_eax eax; union perf_capabilities host_cap; + uint64_t val; host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES); host_cap.capabilities &= (PMU_CAP_FW_WRITES | PMU_CAP_LBR_FMT); @@ -91,11 +92,17 @@ int main(int argc, char *argv[]) vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format); ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format); - /* testcase 3, check invalid LBR format is rejected */ - /* Note, on Arch LBR capable platforms, LBR_FMT in perf capability msr is 0x3f, - * to avoid the failure, use a true invalid format 0x30 for the test. */ - ret = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0x30); - TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail."); + /* + * Testcase 3, check that an "invalid" LBR format is rejected. Only an + * exact match of the host's format (and 0/disabled) is allowed. + */ + for (val = 1; val <= PMU_CAP_LBR_FMT; val++) { + if (val == (host_cap.capabilities & PMU_CAP_LBR_FMT)) + continue; + + ret = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val); + TEST_ASSERT(!ret, "Bad LBR FMT = 0x%lx didn't fail", val); + } printf("Completed perf capability tests.\n"); kvm_vm_free(vm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 32896c845ffe..515dfe9d3bcf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -484,6 +484,10 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; + + /* Fill the stats id string for the vcpu */ + snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", + task_pid_nr(current), id); } static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -1017,21 +1021,21 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm) } } -static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) +static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) { static DEFINE_MUTEX(kvm_debugfs_lock); struct dentry *dent; char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; - int i, ret; + int i, ret = -ENOMEM; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; if (!debugfs_initialized()) return 0; - snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); + snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname); mutex_lock(&kvm_debugfs_lock); dent = debugfs_lookup(dir_name, kvm_debugfs_dir); if (dent) { @@ -1050,13 +1054,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) sizeof(*kvm->debugfs_stat_data), GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) - return -ENOMEM; + goto out_err; for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { pdesc = &kvm_vm_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) - return -ENOMEM; + goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; @@ -1071,7 +1075,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) pdesc = &kvm_vcpu_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) - return -ENOMEM; + goto out_err; stat_data->kvm = kvm; stat_data->desc = pdesc; @@ -1083,12 +1087,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) } ret = kvm_arch_create_vm_debugfs(kvm); - if (ret) { - kvm_destroy_vm_debugfs(kvm); - return i; - } + if (ret) + goto out_err; return 0; +out_err: + kvm_destroy_vm_debugfs(kvm); + return ret; } /* @@ -1119,7 +1124,7 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) return 0; } -static struct kvm *kvm_create_vm(unsigned long type) +static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); struct kvm_memslots *slots; @@ -1155,6 +1160,9 @@ static struct kvm *kvm_create_vm(unsigned long type) */ kvm->debugfs_dentry = ERR_PTR(-ENOENT); + snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", + task_pid_nr(current)); + if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) @@ -1205,7 +1213,7 @@ static struct kvm *kvm_create_vm(unsigned long type) r = kvm_arch_post_init_vm(kvm); if (r) - goto out_err; + goto out_err_mmu_notifier; mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); @@ -1221,12 +1229,18 @@ static struct kvm *kvm_create_vm(unsigned long type) */ if (!try_module_get(kvm_chardev_ops.owner)) { r = -ENODEV; - goto out_err; + goto out_err_mmu_notifier; } + r = kvm_create_vm_debugfs(kvm, fdname); + if (r) + goto out_err; + return kvm; out_err: + module_put(kvm_chardev_ops.owner); +out_err_mmu_notifier: #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); @@ -3916,10 +3930,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto unlock_vcpu_destroy; - /* Fill the stats id string for the vcpu */ - snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", - task_pid_nr(current), id); - /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); @@ -4886,28 +4896,30 @@ EXPORT_SYMBOL_GPL(file_is_kvm); static int kvm_dev_ioctl_create_vm(unsigned long type) { - int r; + char fdname[ITOA_MAX_LEN + 1]; + int r, fd; struct kvm *kvm; struct file *file; - kvm = kvm_create_vm(type); - if (IS_ERR(kvm)) - return PTR_ERR(kvm); + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + snprintf(fdname, sizeof(fdname), "%d", fd); + + kvm = kvm_create_vm(type, fdname); + if (IS_ERR(kvm)) { + r = PTR_ERR(kvm); + goto put_fd; + } + #ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); if (r < 0) goto put_kvm; #endif - r = get_unused_fd_flags(O_CLOEXEC); - if (r < 0) - goto put_kvm; - - snprintf(kvm->stats_id, sizeof(kvm->stats_id), - "kvm-%d", task_pid_nr(current)); - file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { - put_unused_fd(r); r = PTR_ERR(file); goto put_kvm; } @@ -4918,18 +4930,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). */ - if (kvm_create_vm_debugfs(kvm, r) < 0) { - put_unused_fd(r); - fput(file); - return -ENOMEM; - } kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); - fd_install(r, file); - return r; + fd_install(fd, file); + return fd; put_kvm: kvm_put_kvm(kvm); +put_fd: + put_unused_fd(fd); return r; } |