summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-02-22 00:31:43 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-02-22 00:31:43 +0300
commit3e10585335b7967326ca7b4118cada0d2d00a2ab (patch)
treee1655bc4f093f7de3a54dc3b2d83a54159aca10b /arch/x86
parent9c5b80b795e9c847a7b7f5e63c6bcf07873fbcdf (diff)
parent8c6e67bec3192f16fa624203c8131e10cc4814ba (diff)
downloadlinux-3e10585335b7967326ca7b4118cada0d2d00a2ab.tar.xz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "x86: - Support for userspace to emulate Xen hypercalls - Raise the maximum number of user memslots - Scalability improvements for the new MMU. Instead of the complex "fast page fault" logic that is used in mmu.c, tdp_mmu.c uses an rwlock so that page faults are concurrent, but the code that can run against page faults is limited. Right now only page faults take the lock for reading; in the future this will be extended to some cases of page table destruction. I hope to switch the default MMU around 5.12-rc3 (some testing was delayed due to Chinese New Year). - Cleanups for MAXPHYADDR checks - Use static calls for vendor-specific callbacks - On AMD, use VMLOAD/VMSAVE to save and restore host state - Stop using deprecated jump label APIs - Workaround for AMD erratum that made nested virtualization unreliable - Support for LBR emulation in the guest - Support for communicating bus lock vmexits to userspace - Add support for SEV attestation command - Miscellaneous cleanups PPC: - Support for second data watchpoint on POWER10 - Remove some complex workarounds for buggy early versions of POWER9 - Guest entry/exit fixes ARM64: - Make the nVHE EL2 object relocatable - Cleanups for concurrent translation faults hitting the same page - Support for the standard TRNG hypervisor call - A bunch of small PMU/Debug fixes - Simplification of the early init hypercall handling Non-KVM changes (with acks): - Detection of contended rwlocks (implemented only for qrwlocks, because KVM only needs it for x86) - Allow __DISABLE_EXPORTS from assembly code - Provide a saner follow_pfn replacements for modules" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (192 commits) KVM: x86/xen: Explicitly pad struct compat_vcpu_info to 64 bytes KVM: selftests: Don't bother mapping GVA for Xen shinfo test KVM: selftests: Fix hex vs. decimal snafu in Xen test KVM: selftests: Fix size of memslots created by Xen tests KVM: selftests: Ignore recently added Xen tests' build output KVM: selftests: Add missing header file needed by xAPIC IPI tests KVM: selftests: Add operand to vmsave/vmload/vmrun in svm.c KVM: SVM: Make symbol 'svm_gp_erratum_intercept' static locking/arch: Move qrwlock.h include after qspinlock.h KVM: PPC: Book3S HV: Fix host radix SLB optimisation with hash guests KVM: PPC: Book3S HV: Ensure radix guest has no SLB entries KVM: PPC: Don't always report hash MMU capability for P9 < DD2.2 KVM: PPC: Book3S HV: Save and restore FSCR in the P9 path KVM: PPC: remove unneeded semicolon KVM: PPC: Book3S HV: Use POWER9 SLBIA IH=6 variant to clear SLB KVM: PPC: Book3S HV: No need to clear radix host SLB before loading HPT guest KVM: PPC: Book3S HV: Fix radix guest SLB side channel KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support KVM: PPC: Book3S HV: Introduce new capability for 2nd DAWR KVM: PPC: Book3S HV: Add infrastructure to support 2nd DAWR ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h127
-rw-r--r--arch/x86/include/asm/kvm_host.h89
-rw-r--r--arch/x86/include/asm/virtext.h25
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/asm/vmxfeatures.h1
-rw-r--r--arch/x86/include/asm/xen/interface.h3
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h4
-rw-r--r--arch/x86/kernel/apic/apic.c1
-rw-r--r--arch/x86/kernel/reboot.c30
-rw-r--r--arch/x86/kvm/Makefile5
-rw-r--r--arch/x86/kvm/cpuid.c24
-rw-r--r--arch/x86/kvm/cpuid.h24
-rw-r--r--arch/x86/kvm/emulate.c14
-rw-r--r--arch/x86/kvm/hyperv.c343
-rw-r--r--arch/x86/kvm/hyperv.h54
-rw-r--r--arch/x86/kvm/irq.c10
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h10
-rw-r--r--arch/x86/kvm/kvm_emulate.h2
-rw-r--r--arch/x86/kvm/lapic.c60
-rw-r--r--arch/x86/kvm/lapic.h20
-rw-r--r--arch/x86/kvm/mmu.h8
-rw-r--r--arch/x86/kvm/mmu/mmu.c353
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c8
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h7
-rw-r--r--arch/x86/kvm/mmu/page_track.c8
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h8
-rw-r--r--arch/x86/kvm/mmu/spte.c2
-rw-r--r--arch/x86/kvm/mmu/spte.h33
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c46
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c554
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h32
-rw-r--r--arch/x86/kvm/mtrr.c12
-rw-r--r--arch/x86/kvm/pmu.c10
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/avic.c35
-rw-r--r--arch/x86/kvm/svm/nested.c8
-rw-r--r--arch/x86/kvm/svm/sev.c104
-rw-r--r--arch/x86/kvm/svm/svm.c303
-rw-r--r--arch/x86/kvm/svm/svm.h29
-rw-r--r--arch/x86/kvm/svm/svm_ops.h69
-rw-r--r--arch/x86/kvm/trace.h40
-rw-r--r--arch/x86/kvm/vmx/capabilities.h28
-rw-r--r--arch/x86/kvm/vmx/nested.c106
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c294
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c6
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c282
-rw-r--r--arch/x86/kvm/vmx/vmx.h56
-rw-r--r--arch/x86/kvm/x86.c718
-rw-r--r--arch/x86/kvm/x86.h12
-rw-r--r--arch/x86/kvm/xen.c431
-rw-r--r--arch/x86/kvm/xen.h78
55 files changed, 3171 insertions, 1384 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1feb6c089ba2..cc96e26d69f7 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -292,6 +292,7 @@
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
@@ -335,6 +336,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
new file mode 100644
index 000000000000..355a2ab8fc09
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate
+ * "static_call()"s. They are also intended for use when defining
+ * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those
+ * functions that follow the [svm|vmx]_func_name convention.
+ * KVM_X86_OP_NULL() can leave a NULL definition for the
+ * case where there is no definition or a function name that
+ * doesn't match the typical naming convention is supplied.
+ */
+KVM_X86_OP_NULL(hardware_enable)
+KVM_X86_OP_NULL(hardware_disable)
+KVM_X86_OP_NULL(hardware_unsetup)
+KVM_X86_OP_NULL(cpu_has_accelerated_tpr)
+KVM_X86_OP(has_emulated_msr)
+KVM_X86_OP(vcpu_after_set_cpuid)
+KVM_X86_OP(vm_init)
+KVM_X86_OP_NULL(vm_destroy)
+KVM_X86_OP(vcpu_create)
+KVM_X86_OP(vcpu_free)
+KVM_X86_OP(vcpu_reset)
+KVM_X86_OP(prepare_guest_switch)
+KVM_X86_OP(vcpu_load)
+KVM_X86_OP(vcpu_put)
+KVM_X86_OP(update_exception_bitmap)
+KVM_X86_OP(get_msr)
+KVM_X86_OP(set_msr)
+KVM_X86_OP(get_segment_base)
+KVM_X86_OP(get_segment)
+KVM_X86_OP(get_cpl)
+KVM_X86_OP(set_segment)
+KVM_X86_OP_NULL(get_cs_db_l_bits)
+KVM_X86_OP(set_cr0)
+KVM_X86_OP(is_valid_cr4)
+KVM_X86_OP(set_cr4)
+KVM_X86_OP(set_efer)
+KVM_X86_OP(get_idt)
+KVM_X86_OP(set_idt)
+KVM_X86_OP(get_gdt)
+KVM_X86_OP(set_gdt)
+KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr7)
+KVM_X86_OP(cache_reg)
+KVM_X86_OP(get_rflags)
+KVM_X86_OP(set_rflags)
+KVM_X86_OP(tlb_flush_all)
+KVM_X86_OP(tlb_flush_current)
+KVM_X86_OP_NULL(tlb_remote_flush)
+KVM_X86_OP_NULL(tlb_remote_flush_with_range)
+KVM_X86_OP(tlb_flush_gva)
+KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(run)
+KVM_X86_OP_NULL(handle_exit)
+KVM_X86_OP_NULL(skip_emulated_instruction)
+KVM_X86_OP_NULL(update_emulated_instruction)
+KVM_X86_OP(set_interrupt_shadow)
+KVM_X86_OP(get_interrupt_shadow)
+KVM_X86_OP(patch_hypercall)
+KVM_X86_OP(set_irq)
+KVM_X86_OP(set_nmi)
+KVM_X86_OP(queue_exception)
+KVM_X86_OP(cancel_injection)
+KVM_X86_OP(interrupt_allowed)
+KVM_X86_OP(nmi_allowed)
+KVM_X86_OP(get_nmi_mask)
+KVM_X86_OP(set_nmi_mask)
+KVM_X86_OP(enable_nmi_window)
+KVM_X86_OP(enable_irq_window)
+KVM_X86_OP(update_cr8_intercept)
+KVM_X86_OP(check_apicv_inhibit_reasons)
+KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
+KVM_X86_OP(refresh_apicv_exec_ctrl)
+KVM_X86_OP(hwapic_irr_update)
+KVM_X86_OP(hwapic_isr_update)
+KVM_X86_OP_NULL(guest_apic_has_interrupt)
+KVM_X86_OP(load_eoi_exitmap)
+KVM_X86_OP(set_virtual_apic_mode)
+KVM_X86_OP_NULL(set_apic_access_page_addr)
+KVM_X86_OP(deliver_posted_interrupt)
+KVM_X86_OP_NULL(sync_pir_to_irr)
+KVM_X86_OP(set_tss_addr)
+KVM_X86_OP(set_identity_map_addr)
+KVM_X86_OP(get_mt_mask)
+KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_NULL(has_wbinvd_exit)
+KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(get_exit_info)
+KVM_X86_OP(check_intercept)
+KVM_X86_OP(handle_exit_irqoff)
+KVM_X86_OP_NULL(request_immediate_exit)
+KVM_X86_OP(sched_in)
+KVM_X86_OP_NULL(slot_enable_log_dirty)
+KVM_X86_OP_NULL(slot_disable_log_dirty)
+KVM_X86_OP_NULL(flush_log_dirty)
+KVM_X86_OP_NULL(enable_log_dirty_pt_masked)
+KVM_X86_OP_NULL(cpu_dirty_log_size)
+KVM_X86_OP_NULL(pre_block)
+KVM_X86_OP_NULL(post_block)
+KVM_X86_OP_NULL(vcpu_blocking)
+KVM_X86_OP_NULL(vcpu_unblocking)
+KVM_X86_OP_NULL(update_pi_irte)
+KVM_X86_OP_NULL(apicv_post_state_restore)
+KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_NULL(set_hv_timer)
+KVM_X86_OP_NULL(cancel_hv_timer)
+KVM_X86_OP(setup_mce)
+KVM_X86_OP(smi_allowed)
+KVM_X86_OP(pre_enter_smm)
+KVM_X86_OP(pre_leave_smm)
+KVM_X86_OP(enable_smi_window)
+KVM_X86_OP_NULL(mem_enc_op)
+KVM_X86_OP_NULL(mem_enc_reg_region)
+KVM_X86_OP_NULL(mem_enc_unreg_region)
+KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(can_emulate_instruction)
+KVM_X86_OP(apic_init_signal_blocked)
+KVM_X86_OP_NULL(enable_direct_tlbflush)
+KVM_X86_OP_NULL(migrate_timers)
+KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP_NULL(complete_emulated_msr)
+
+#undef KVM_X86_OP
+#undef KVM_X86_OP_NULL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..84499aad01a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,10 +40,8 @@
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
#define KVM_MAX_VCPU_ID 1023
-#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
#define KVM_HALT_POLL_NS_DEFAULT 200000
@@ -52,6 +50,9 @@
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
+#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
+ KVM_BUS_LOCK_DETECTION_EXIT)
+
/* x86-specific vcpu->requests bit members */
#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
@@ -200,9 +201,17 @@ enum x86_intercept_stage;
#define DR6_BS (1 << 14)
#define DR6_BT (1 << 15)
#define DR6_RTM (1 << 16)
-#define DR6_FIXED_1 0xfffe0ff0
-#define DR6_INIT 0xffff0ff0
+/*
+ * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
+ * We can regard all the bits in DR6_FIXED_1 as active_low bits;
+ * they will never be 0 for now, but when they are defined
+ * in the future it will require no code change.
+ *
+ * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
+ */
+#define DR6_ACTIVE_LOW 0xffff0ff0
#define DR6_VOLATILE 0x0001e00f
+#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
@@ -337,6 +346,8 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+#define KVM_HAVE_MMU_RWLOCK
+
struct kvm_mmu_page;
/*
@@ -358,8 +369,6 @@ struct kvm_mmu {
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
- void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte);
hpa_t root_hpa;
gpa_t root_pgd;
union kvm_mmu_role mmu_role;
@@ -510,6 +519,7 @@ struct kvm_vcpu_hv_synic {
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
+ struct kvm_vcpu *vcpu;
u32 vp_index;
u64 hv_vapic;
s64 runtime_offset;
@@ -520,6 +530,15 @@ struct kvm_vcpu_hv {
cpumask_t tlb_flush;
};
+/* Xen HVM per vcpu emulation context */
+struct kvm_vcpu_xen {
+ u64 hypercall_rip;
+ bool vcpu_info_set;
+ bool vcpu_time_info_set;
+ struct gfn_to_hva_cache vcpu_info_cache;
+ struct gfn_to_hva_cache vcpu_time_info_cache;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -640,7 +659,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
- unsigned long cr3_lm_rsvd_bits;
+ u64 reserved_gpa_bits;
int maxphyaddr;
int max_tdp_level;
@@ -717,7 +736,9 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
- struct kvm_vcpu_hv hyperv;
+ bool hyperv_enabled;
+ struct kvm_vcpu_hv *hyperv;
+ struct kvm_vcpu_xen xen;
cpumask_var_t wbinvd_dirty_mask;
@@ -888,6 +909,14 @@ struct msr_bitmap_range {
unsigned long *bitmap;
};
+/* Xen emulation context */
+struct kvm_xen {
+ bool long_mode;
+ bool shinfo_set;
+ u8 upcall_vector;
+ struct gfn_to_hva_cache shinfo_cache;
+};
+
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
@@ -967,6 +996,7 @@ struct kvm_arch {
struct hlist_head mask_notifier_list;
struct kvm_hv hyperv;
+ struct kvm_xen xen;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
@@ -998,9 +1028,12 @@ struct kvm_arch {
struct msr_bitmap_range ranges[16];
} msr_filter;
+ bool bus_lock_detection_enabled;
+
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+#ifdef CONFIG_X86_64
/*
* Whether the TDP MMU is enabled for this VM. This contains a
* snapshot of the TDP MMU module parameter from when the VM was
@@ -1026,12 +1059,25 @@ struct kvm_arch {
* tdp_mmu_page set and a root_count of 0.
*/
struct list_head tdp_mmu_pages;
+
+ /*
+ * Protects accesses to the following fields when the MMU lock
+ * is held in read mode:
+ * - tdp_mmu_pages (above)
+ * - the link field of struct kvm_mmu_pages used by the TDP MMU
+ * - lpage_disallowed_mmu_pages
+ * - the lpage_disallowed_link field of struct kvm_mmu_pages used
+ * by the TDP MMU
+ * It is acceptable, but not necessary, to acquire this lock when
+ * the thread holds the MMU lock in write mode.
+ */
+ spinlock_t tdp_mmu_pages_lock;
+#endif /* CONFIG_X86_64 */
};
struct kvm_vm_stat {
ulong mmu_shadow_zapped;
ulong mmu_pte_write;
- ulong mmu_pte_updated;
ulong mmu_pde_zapped;
ulong mmu_flooded;
ulong mmu_recycled;
@@ -1340,6 +1386,19 @@ extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops;
+#define KVM_X86_OP(func) \
+ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+
+static inline void kvm_ops_static_call_update(void)
+{
+#define KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+}
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1351,7 +1410,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
{
if (kvm_x86_ops.tlb_remote_flush &&
- !kvm_x86_ops.tlb_remote_flush(kvm))
+ !static_call(kvm_x86_tlb_remote_flush)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1421,6 +1480,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits;
extern u64 kvm_max_tsc_scaling_ratio;
/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
extern u64 kvm_default_tsc_scaling_ratio;
+/* bus lock detection supported? */
+extern bool kvm_has_bus_lock_exit;
extern u64 kvm_mce_cap_supported;
@@ -1501,7 +1562,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -1742,14 +1803,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops.vcpu_blocking)
- kvm_x86_ops.vcpu_blocking(vcpu);
+ static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops.vcpu_unblocking)
- kvm_x86_ops.vcpu_unblocking(vcpu);
+ static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 9aad0e0876fb..8757078d4442 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -30,16 +30,29 @@ static inline int cpu_has_vmx(void)
}
-/** Disable VMX on the current CPU
+/**
+ * cpu_vmxoff() - Disable VMX on the current CPU
*
- * vmxoff causes a undefined-opcode exception if vmxon was not run
- * on the CPU previously. Only call this function if you know VMX
- * is enabled.
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
*/
-static inline void cpu_vmxoff(void)
+static inline int cpu_vmxoff(void)
{
- asm volatile ("vmxoff");
+ asm_volatile_goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
}
static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 38ca445a8429..358707f60d99 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -73,6 +73,7 @@
#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
#define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING)
#define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 9915990fd8cf..d9a74681a77d 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -83,5 +83,6 @@
#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */
#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
+#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */
#endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 9139b3e86316..baca0b00ef76 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -182,6 +182,9 @@ struct arch_shared_info {
unsigned long p2m_cr3; /* cr3 value of the p2m address space */
unsigned long p2m_vaddr; /* virtual address of the p2m list */
unsigned long p2m_generation; /* generation count of p2m mapping */
+#ifdef CONFIG_X86_32
+ uint32_t wc_sec_hi;
+#endif
};
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 8e76d3701db3..5a3022c8af82 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -112,6 +112,7 @@ struct kvm_ioapic_state {
#define KVM_NR_IRQCHIPS 3
#define KVM_RUN_X86_SMM (1 << 0)
+#define KVM_RUN_X86_BUS_LOCK (1 << 1)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index ada955c5ebb6..b8e650a985e3 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -89,6 +89,7 @@
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_UMWAIT 67
#define EXIT_REASON_TPAUSE 68
+#define EXIT_REASON_BUS_LOCK 74
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -150,7 +151,8 @@
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
- { EXIT_REASON_TPAUSE, "TPAUSE" }
+ { EXIT_REASON_TPAUSE, "TPAUSE" }, \
+ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7f4c081f59f0..819db00c9388 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1747,6 +1747,7 @@ void apic_ap_setup(void)
#ifdef CONFIG_X86_X2APIC
int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
enum {
X2APIC_OFF,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9991c5920aac..b29657b76e3f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -547,31 +547,21 @@ static void emergency_vmx_disable_all(void)
local_irq_disable();
/*
- * We need to disable VMX on all CPUs before rebooting, otherwise
- * we risk hanging up the machine, because the CPU ignores INIT
- * signals when VMX is enabled.
+ * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
+ * the machine, because the CPU blocks INIT when it's in VMX root.
*
- * We can't take any locks and we may be on an inconsistent
- * state, so we use NMIs as IPIs to tell the other CPUs to disable
- * VMX and halt.
+ * We can't take any locks and we may be on an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
*
- * For safety, we will avoid running the nmi_shootdown_cpus()
- * stuff unnecessarily, but we don't have a way to check
- * if other CPUs have VMX enabled. So we will call it only if the
- * CPU we are running on has VMX enabled.
- *
- * We will miss cases where VMX is not enabled on all CPUs. This
- * shouldn't do much harm because KVM always enable VMX on all
- * CPUs anyway. But we can miss it on the small window where KVM
- * is still enabling VMX.
+ * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
+ * doesn't prevent a different CPU from being in VMX root operation.
*/
- if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU. */
- cpu_vmxoff();
+ if (cpu_has_vmx()) {
+ /* Safely force _this_ CPU out of VMX root operation. */
+ __cpu_emergency_vmxoff();
- /* Halt and disable VMX on the other CPUs */
+ /* Halt and exit VMX root operation on the other CPUs. */
nmi_shootdown_cpus(vmxoff_nmi);
-
}
}
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4bd14ab01323..aeab168c5711 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,10 +14,11 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/dirty_ring.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
+kvm-y += x86.o emulate.o i8259.o irq.o lapic.o xen.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o
+ mmu/spte.o
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 38172ca627d3..c8f2592ccc99 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -173,16 +173,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- kvm_mmu_reset_context(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
kvm_pmu_refresh(vcpu);
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
- vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+ kvm_hv_set_cpuid(vcpu);
/* Invoke the vendor callback only after the above state is updated. */
- kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
+ static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+
+ /*
+ * Except for the MMU, which needs to be reset after any vendor
+ * specific adjustments to the reserved GPA bits.
+ */
+ kvm_mmu_reset_context(vcpu);
}
static int is_efer_nx(void)
@@ -223,6 +229,16 @@ not_found:
return 36;
}
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits. The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -434,7 +450,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX512_BF16)
+ F(AVX_VNNI) | F(AVX512_BF16)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dc921d76e42e..2a0c5064497f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -30,15 +30,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.maxphyaddr;
}
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+ return !kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t alignment)
+{
+ return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
}
struct cpuid_reg {
@@ -324,11 +341,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
kvm_cpu_cap_set(x86_feature);
}
-static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
-}
-
static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
unsigned int kvm_feature)
{
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66a08322988f..f7970ba6219f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2506,12 +2506,12 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
val = GET_SMSTATE(u32, smstate, 0x7fcc);
- if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 6, val))
return X86EMUL_UNHANDLEABLE;
val = GET_SMSTATE(u32, smstate, 0x7fc8);
- if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 7, val))
return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
@@ -2564,14 +2564,14 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
- val = GET_SMSTATE(u32, smstate, 0x7f68);
+ val = GET_SMSTATE(u64, smstate, 0x7f68);
- if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 6, val))
return X86EMUL_UNHANDLEABLE;
- val = GET_SMSTATE(u32, smstate, 0x7f60);
+ val = GET_SMSTATE(u64, smstate, 0x7f60);
- if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 7, val))
return X86EMUL_UNHANDLEABLE;
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
@@ -4329,7 +4329,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_dr(ctxt, 6, &dr6);
dr6 &= ~DR_TRAP_BITS;
- dr6 |= DR6_BD | DR6_RTM;
+ dr6 |= DR6_BD | DR6_ACTIVE_LOW;
ctxt->ops->set_dr(ctxt, 6, dr6);
return emulate_db(ctxt);
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 922c69dcca4d..7d2dae92d638 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,6 +23,7 @@
#include "ioapic.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "xen.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h>
@@ -36,6 +37,9 @@
#include "trace.h"
#include "irq.h"
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -128,7 +132,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
synic_update_vector(synic, vector);
/* Load SynIC vectors into EOI exit bitmap */
- kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
return 0;
}
@@ -141,10 +145,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
return NULL;
vcpu = kvm_get_vcpu(kvm, vpidx);
- if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ if (kvm_hv_get_vpindex(vcpu) == vpidx)
return vcpu;
return NULL;
}
@@ -157,15 +161,15 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
vcpu = get_vcpu_by_vpidx(kvm, vpidx);
if (!vcpu)
return NULL;
- synic = vcpu_to_synic(vcpu);
+ synic = to_hv_synic(vcpu);
return (synic->active) ? synic : NULL;
}
static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_stimer *stimer;
int gsi, idx;
@@ -189,8 +193,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
hv_vcpu->exit.u.synic.msr = msr;
@@ -204,7 +208,7 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
u32 msr, u64 data, bool host)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int ret;
if (!synic->active && !host)
@@ -282,8 +286,7 @@ static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
hv->hv_syndbg.control.status =
@@ -293,8 +296,8 @@ static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
hv_vcpu->exit.u.syndbg.msr = msr;
@@ -310,13 +313,13 @@ static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
return 1;
trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
- vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data);
+ to_hv_vcpu(vcpu)->vp_index, msr, data);
switch (msr) {
case HV_X64_MSR_SYNDBG_CONTROL:
syndbg->control.control = data;
@@ -349,7 +352,7 @@ static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
return 1;
@@ -377,9 +380,7 @@ static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
break;
}
- trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id,
- vcpu_to_hv_vcpu(vcpu)->vp_index, msr,
- *pdata);
+ trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
return 0;
}
@@ -421,7 +422,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
struct kvm_lapic_irq irq;
int ret, vector;
@@ -457,7 +458,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
{
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
int i;
trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
@@ -514,7 +515,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
static u64 get_time_ref_counter(struct kvm *kvm)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
u64 tsc;
@@ -534,10 +535,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
set_bit(stimer->index,
- vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
if (vcpu_kick)
kvm_vcpu_kick(vcpu);
@@ -545,14 +546,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
- trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index);
hrtimer_cancel(&stimer->timer);
clear_bit(stimer->index,
- vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
stimer->msg_pending = false;
stimer->exp_time = 0;
}
@@ -562,7 +563,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
struct kvm_vcpu_hv_stimer *stimer;
stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
- trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index);
stimer_mark_pending(stimer, true);
@@ -579,7 +580,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
u64 time_now;
ktime_t ktime_now;
- time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
ktime_now = ktime_get();
if (stimer->config.periodic) {
@@ -596,7 +597,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
stimer->exp_time = time_now + stimer->count;
trace_kvm_hv_stimer_start_periodic(
- stimer_to_vcpu(stimer)->vcpu_id,
+ hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index,
time_now, stimer->exp_time);
@@ -618,7 +619,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
return 0;
}
- trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index,
time_now, stimer->count);
@@ -633,13 +634,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
- trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
stimer_cleanup(stimer);
@@ -657,13 +658,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
- trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
stimer_cleanup(stimer);
@@ -694,7 +695,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
struct hv_message *src_msg, bool no_retry)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
gfn_t msg_page_gfn;
struct hv_message_header hv_hdr;
@@ -750,7 +751,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct hv_message *msg = &stimer->msg;
struct hv_timer_message_payload *payload =
(struct hv_timer_message_payload *)&msg->u.payload;
@@ -763,14 +764,14 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
payload->expiration_time = stimer->exp_time;
payload->delivery_time = get_time_ref_counter(vcpu->kvm);
- return synic_deliver_msg(vcpu_to_synic(vcpu),
+ return synic_deliver_msg(to_hv_synic(vcpu),
stimer->config.sintx, msg,
no_retry);
}
static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct kvm_lapic_irq irq = {
.delivery_mode = APIC_DM_FIXED,
.vector = stimer->config.apic_vector
@@ -790,7 +791,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
r = stimer_send_msg(stimer);
else
r = stimer_notify_direct(stimer);
- trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, direct, r);
if (!r) {
stimer->msg_pending = false;
@@ -801,11 +802,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_stimer *stimer;
u64 time_now, exp_time;
int i;
+ if (!hv_vcpu)
+ return;
+
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
stimer = &hv_vcpu->stimer[i];
@@ -831,16 +835,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
int i;
+ if (!hv_vcpu)
+ return;
+
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_cleanup(&hv_vcpu->stimer[i]);
+
+ kfree(hv_vcpu);
+ vcpu->arch.hyperv = NULL;
}
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
return false;
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
}
@@ -880,28 +895,41 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
stimer_prepare_msg(stimer);
}
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu;
int i;
+ hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+ if (!hv_vcpu)
+ return -ENOMEM;
+
+ vcpu->arch.hyperv = hv_vcpu;
+ hv_vcpu->vcpu = vcpu;
+
synic_init(&hv_vcpu->synic);
bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_init(&hv_vcpu->stimer[i], i);
-}
-
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+
+ return 0;
}
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
{
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv_synic *synic;
+ int r;
+
+ if (!to_hv_vcpu(vcpu)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+ }
+
+ synic = to_hv_synic(vcpu);
/*
* Hyper-V SynIC auto EOI SINT's are
@@ -939,10 +967,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
return r;
}
-static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
- u32 index, u64 *pdata)
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
size_t size = ARRAY_SIZE(hv->hv_crash_param);
if (WARN_ON_ONCE(index >= size))
@@ -952,41 +979,26 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
return 0;
}
-static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
*pdata = hv->hv_crash_ctl;
return 0;
}
-static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
-
- if (host)
- hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
- if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) {
-
- vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
- hv->hv_crash_param[0],
- hv->hv_crash_param[1],
- hv->hv_crash_param[2],
- hv->hv_crash_param[3],
- hv->hv_crash_param[4]);
-
- /* Send notification about crash to user space */
- kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
- }
+ hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
return 0;
}
-static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
- u32 index, u64 data)
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
size_t size = ARRAY_SIZE(hv->hv_crash_param);
if (WARN_ON_ONCE(index >= size))
@@ -1068,7 +1080,7 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
u32 tsc_seq;
u64 gfn;
@@ -1078,7 +1090,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
return;
- mutex_lock(&kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
goto out_unlock;
@@ -1122,14 +1134,14 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
out_unlock:
- mutex_unlock(&kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
}
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
@@ -1139,9 +1151,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
break;
case HV_X64_MSR_HYPERCALL: {
- u64 gfn;
- unsigned long addr;
- u8 instructions[4];
+ u8 instructions[9];
+ int i = 0;
+ u64 addr;
/* if guest os id is not set hypercall should remain disabled */
if (!hv->hv_guest_os_id)
@@ -1150,16 +1162,33 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_hypercall = data;
break;
}
- gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return 1;
- kvm_x86_ops.patch_hypercall(vcpu, instructions);
- ((unsigned char *)instructions)[3] = 0xc3; /* ret */
- if (__copy_to_user((void __user *)addr, instructions, 4))
+
+ /*
+ * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+ * the same way Xen itself does, by setting the bit 31 of EAX
+ * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
+ * going to be clobbered on 64-bit.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ /* orl $0x80000000, %eax */
+ instructions[i++] = 0x0d;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x80;
+ }
+
+ /* vmcall/vmmcall */
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ i += 3;
+
+ /* ret */
+ ((unsigned char *)instructions)[i++] = 0xc3;
+
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
return 1;
hv->hv_hypercall = data;
- mark_page_dirty(kvm, gfn);
break;
}
case HV_X64_MSR_REFERENCE_TSC:
@@ -1168,11 +1197,25 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- return kvm_hv_msr_set_crash_data(vcpu,
+ return kvm_hv_msr_set_crash_data(kvm,
msr - HV_X64_MSR_CRASH_P0,
data);
case HV_X64_MSR_CRASH_CTL:
- return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ if (host)
+ return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+ if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+ break;
case HV_X64_MSR_RESET:
if (data == 1) {
vcpu_debug(vcpu, "hyper-v reset requested\n");
@@ -1216,11 +1259,11 @@ static u64 current_task_runtime_100ns(void)
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
int vcpu_idx = kvm_vcpu_get_idx(vcpu);
u32 new_vp_index = (u32)data;
@@ -1291,14 +1334,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
case HV_X64_MSR_STIMER3_CONFIG: {
int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
- return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_set_config(to_hv_stimer(vcpu, timer_index),
data, host);
}
case HV_X64_MSR_STIMER0_COUNT:
@@ -1307,7 +1350,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_STIMER3_COUNT: {
int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
- return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_set_count(to_hv_stimer(vcpu, timer_index),
data, host);
}
case HV_X64_MSR_TSC_FREQUENCY:
@@ -1330,7 +1373,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
{
u64 data = 0;
struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
@@ -1346,11 +1389,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = hv->hv_tsc_page;
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- return kvm_hv_msr_get_crash_data(vcpu,
+ return kvm_hv_msr_get_crash_data(kvm,
msr - HV_X64_MSR_CRASH_P0,
pdata);
case HV_X64_MSR_CRASH_CTL:
- return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ return kvm_hv_msr_get_crash_ctl(kvm, pdata);
case HV_X64_MSR_RESET:
data = 0;
break;
@@ -1379,7 +1422,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
bool host)
{
u64 data = 0;
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
switch (msr) {
case HV_X64_MSR_VP_INDEX:
@@ -1403,14 +1446,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
+ return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
case HV_X64_MSR_STIMER3_CONFIG: {
int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
- return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_get_config(to_hv_stimer(vcpu, timer_index),
pdata);
}
case HV_X64_MSR_STIMER0_COUNT:
@@ -1419,7 +1462,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_STIMER3_COUNT: {
int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
- return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_get_count(to_hv_stimer(vcpu, timer_index),
pdata);
}
case HV_X64_MSR_TSC_FREQUENCY:
@@ -1438,12 +1481,22 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+ }
+
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
- mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
return r;
} else
return kvm_hv_set_msr(vcpu, msr, data, host);
@@ -1451,12 +1504,22 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+ }
+
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
- mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
return r;
} else
return kvm_hv_get_msr(vcpu, msr, pdata, host);
@@ -1466,7 +1529,7 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
u64 *vp_bitmap, unsigned long *vcpu_bitmap)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
int i, bank, sbank = 0;
@@ -1483,18 +1546,16 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
- (unsigned long *)vp_bitmap))
+ if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
__set_bit(i, vcpu_bitmap);
}
return vcpu_bitmap;
}
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
- u16 rep_cnt, bool ex)
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex)
{
- struct kvm *kvm = current_vcpu->kvm;
- struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1592,10 +1653,10 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
bool ex, bool fast)
{
- struct kvm *kvm = current_vcpu->kvm;
+ struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1666,9 +1727,20 @@ ret_success:
return HV_STATUS_SUCCESS;
}
-bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
{
- return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0;
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
+ if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX)
+ vcpu->arch.hyperv_enabled = true;
+ else
+ vcpu->arch.hyperv_enabled = false;
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
}
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
@@ -1698,6 +1770,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
struct eventfd_ctx *eventfd;
if (unlikely(!fast)) {
@@ -1726,7 +1799,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
rcu_read_lock();
- eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ eventfd = idr_find(&hv->conn_to_evt, param);
rcu_read_unlock();
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
@@ -1745,7 +1818,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
- if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -1793,7 +1866,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+ if (unlikely(rep || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -1855,7 +1928,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
fallthrough;
case HVCALL_RESET_DEBUG_SESSION: {
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu)) {
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
@@ -1885,23 +1958,26 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
void kvm_hv_init_vm(struct kvm *kvm)
{
- mutex_init(&kvm->arch.hyperv.hv_lock);
- idr_init(&kvm->arch.hyperv.conn_to_evt);
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_init(&hv->hv_lock);
+ idr_init(&hv->conn_to_evt);
}
void kvm_hv_destroy_vm(struct kvm *kvm)
{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
int i;
- idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
eventfd_ctx_put(eventfd);
- idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+ idr_destroy(&hv->conn_to_evt);
}
static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
int ret;
@@ -1925,7 +2001,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
mutex_lock(&hv->hv_lock);
@@ -1997,8 +2073,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
break;
case HYPERV_CPUID_INTERFACE:
- memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
- ent->eax = signature[0];
+ ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
break;
case HYPERV_CPUID_VERSION:
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 6d7def2b0aad..e951af1fcb2c 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -50,38 +50,46 @@
/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2)
-static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
{
- return &vcpu->arch.hyperv;
+ return &kvm->arch.hyperv;
}
-static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_arch *arch;
-
- arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
- return container_of(arch, struct kvm_vcpu, arch);
+ return vcpu->arch.hyperv;
}
-static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
{
- return &vcpu->arch.hyperv.synic;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return &hv_vcpu->synic;
}
-static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
{
- return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+ struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+ return hv_vcpu->vcpu;
}
-static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
{
return &vcpu->kvm->arch.hyperv.hv_syndbg;
}
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+}
+
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
@@ -89,32 +97,35 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
struct hv_vp_assist_page *assist_page);
-static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
- int timer_index)
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
{
- return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+ return &to_hv_vcpu(vcpu)->stimer[timer_index];
}
-static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
{
struct kvm_vcpu_hv *hv_vcpu;
hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
stimer[0]);
- return hv_vcpu_to_vcpu(hv_vcpu);
+ return hv_vcpu->vcpu;
}
static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
{
- return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
HV_SYNIC_STIMER_COUNT);
}
@@ -125,6 +136,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 814698e5b152..172b05343cfd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -14,6 +14,7 @@
#include "irq.h"
#include "i8254.h"
#include "x86.h"
+#include "xen.h"
/*
* check if there are pending timer events
@@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.injected;
+ if (kvm_xen_has_interrupt(v))
+ return 1;
+
if (!kvm_apic_accept_pic_intr(v))
return 0;
@@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
+ if (kvm_xen_has_interrupt(v))
+ return v->kvm->arch.xen.upcall_vector;
+
if (irqchip_split(v->kvm)) {
int vector = v->arch.pending_external_vector;
@@ -143,8 +150,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
- if (kvm_x86_ops.migrate_timers)
- kvm_x86_ops.migrate_timers(vcpu);
+ static_call_cond(kvm_x86_migrate_timers)(vcpu);
}
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a889563ad02d..2e11da2f5621 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -68,7 +68,7 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
return 0;
if (!kvm_register_is_available(vcpu, reg))
- kvm_x86_ops.cache_reg(vcpu, reg);
+ static_call(kvm_x86_cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
@@ -108,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
might_sleep(); /* on svm */
if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -118,7 +118,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -132,14 +132,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 43c93ffa76ed..0d359115429a 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -205,7 +205,7 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
- int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43cceadd073e..45d40bfacb7c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -91,8 +91,8 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-struct static_key_deferred apic_hw_disabled __read_mostly;
-struct static_key_deferred apic_sw_disabled __read_mostly;
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
static inline int apic_enabled(struct kvm_lapic *apic)
{
@@ -290,9 +290,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
if (enabled != apic->sw_enabled) {
apic->sw_enabled = enabled;
if (enabled)
- static_key_slow_dec_deferred(&apic_sw_disabled);
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
else
- static_key_slow_inc(&apic_sw_disabled.key);
+ static_branch_inc(&apic_sw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
@@ -484,7 +484,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- kvm_x86_ops.hwapic_irr_update(vcpu,
+ static_call(kvm_x86_hwapic_irr_update)(vcpu,
apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
@@ -515,7 +515,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- kvm_x86_ops.hwapic_isr_update(vcpu, vec);
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -563,8 +563,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- kvm_x86_ops.hwapic_isr_update(vcpu,
- apic_find_highest_isr(apic));
+ static_call(kvm_x86_hwapic_isr_update)(vcpu,
+ apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -701,7 +701,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (apic->vcpu->arch.apicv_active)
- highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu);
+ highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1090,7 +1090,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) {
+ if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
@@ -1245,7 +1245,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ if (to_hv_vcpu(apic->vcpu) &&
+ test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap))
kvm_hv_synic_send_eoi(apic->vcpu, vector);
kvm_ioapic_send_eoi(apic, vector);
@@ -1814,7 +1815,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
{
WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- kvm_x86_ops.cancel_hv_timer(apic->vcpu);
+ static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
@@ -1831,7 +1832,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!ktimer->tscdeadline)
return false;
- if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired))
+ if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
@@ -2175,10 +2176,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
hrtimer_cancel(&apic->lapic_timer.timer);
if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
- static_key_slow_dec_deferred(&apic_hw_disabled);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
if (!apic->sw_enabled)
- static_key_slow_dec_deferred(&apic_sw_disabled);
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
if (apic->regs)
free_page((unsigned long)apic->regs);
@@ -2250,9 +2251,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
- static_key_slow_dec_deferred(&apic_hw_disabled);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
} else {
- static_key_slow_inc(&apic_hw_disabled.key);
+ static_branch_inc(&apic_hw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
}
@@ -2261,7 +2262,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- kvm_x86_ops.set_virtual_apic_mode(vcpu);
+ static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2338,9 +2339,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- kvm_x86_ops.apicv_post_state_restore(vcpu);
- kvm_x86_ops.hwapic_irr_update(vcpu, -1);
- kvm_x86_ops.hwapic_isr_update(vcpu, -1);
+ static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2449,7 +2450,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
* thinking that APIC state has changed.
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
- static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
@@ -2512,7 +2513,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
*/
apic_clear_irr(vector, apic);
- if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) {
/*
* For auto-EOI interrupts, there might be another pending
* interrupt above PPR, so check whether to raise another
@@ -2601,10 +2602,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- kvm_x86_ops.apicv_post_state_restore(vcpu);
- kvm_x86_ops.hwapic_irr_update(vcpu,
+ static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call(kvm_x86_hwapic_irr_update)(vcpu,
apic_find_highest_irr(apic));
- kvm_x86_ops.hwapic_isr_update(vcpu,
+ static_call(kvm_x86_hwapic_isr_update)(vcpu,
apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -2904,13 +2905,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
}
}
-void kvm_lapic_init(void)
-{
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&apic_hw_disabled, HZ);
- jump_label_rate_limit(&apic_sw_disabled, HZ);
-}
-
void kvm_lapic_exit(void)
{
static_key_deferred_flush(&apic_hw_disabled);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4fb86e3a9dd3..997c45a5963a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
+#include "hyperv.h"
+
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
#define KVM_APIC_LVT_NUM 6
@@ -125,13 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-}
-
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
-void kvm_lapic_init(void);
void kvm_lapic_exit(void);
#define VEC_POS(v) ((v) & (32 - 1))
@@ -172,29 +168,29 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va
__kvm_lapic_set_reg(apic->regs, reg_off, val);
}
-extern struct static_key kvm_no_apic_vcpu;
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
- if (static_key_false(&kvm_no_apic_vcpu))
+ if (static_branch_unlikely(&kvm_has_noapic_vcpu))
return vcpu->arch.apic;
return true;
}
-extern struct static_key_deferred apic_hw_disabled;
+extern struct static_key_false_deferred apic_hw_disabled;
static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
- if (static_key_false(&apic_hw_disabled.key))
+ if (static_branch_unlikely(&apic_hw_disabled.key))
return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
return MSR_IA32_APICBASE_ENABLE;
}
-extern struct static_key_deferred apic_sw_disabled;
+extern struct static_key_false_deferred apic_sw_disabled;
static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
{
- if (static_key_false(&apic_sw_disabled.key))
+ if (static_branch_unlikely(&apic_sw_disabled.key))
return apic->sw_enabled;
return true;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 261be1d2032b..c68bfc3e2402 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -102,7 +102,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
vcpu->arch.mmu->shadow_root_level);
}
@@ -152,7 +152,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*
* TODO: introduce APIs to split these two cases.
*/
-static inline int is_writable_pte(unsigned long pte)
+static inline bool is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
@@ -174,8 +174,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned pte_access, unsigned pte_pkey,
unsigned pfec)
{
- int cpl = kvm_x86_ops.get_cpl(vcpu);
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ int cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
/*
* If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..e507568cd55d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -190,7 +190,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
int ret = -ENOTSUPP;
if (range && kvm_x86_ops.tlb_remote_flush_with_range)
- ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
+ ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
if (ret)
kvm_flush_remote_tlbs(kvm);
@@ -844,17 +844,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
int i, count = 0;
if (!rmap_head->val) {
- rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+ rmap_printk("%p %llx 0->1\n", spte, *spte);
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
- rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+ rmap_printk("%p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
- rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+ rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
@@ -906,14 +906,14 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
pr_err("%s: %p 0->BUG\n", __func__, spte);
BUG();
} else if (!(rmap_head->val & 1)) {
- rmap_printk("%s: %p 1->0\n", __func__, spte);
+ rmap_printk("%p 1->0\n", spte);
if ((u64 *)rmap_head->val != spte) {
pr_err("%s: %p 1->BUG\n", __func__, spte);
BUG();
}
rmap_head->val = 0;
} else {
- rmap_printk("%s: %p many->many\n", __func__, spte);
+ rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -1115,7 +1115,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
return false;
- rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
@@ -1142,7 +1142,7 @@ static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
@@ -1184,7 +1184,7 @@ static bool spte_set_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
/*
* Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
@@ -1225,7 +1225,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
while (mask) {
@@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
while (mask) {
@@ -1283,8 +1283,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask)
{
if (kvm_x86_ops.enable_log_dirty_pt_masked)
- kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
- mask);
+ static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot,
+ gfn_offset,
+ mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
@@ -1292,7 +1293,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
int kvm_cpu_dirty_log_size(void)
{
if (kvm_x86_ops.cpu_dirty_log_size)
- return kvm_x86_ops.cpu_dirty_log_size();
+ return static_call(kvm_x86_cpu_dirty_log_size)();
return 0;
}
@@ -1309,7 +1310,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
@@ -1331,7 +1332,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
while ((sptep = rmap_get_first(rmap_head, &iter))) {
- rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+ rmap_printk("spte %p %llx.\n", sptep, *sptep);
pte_list_remove(rmap_head, sptep);
flush = true;
@@ -1363,7 +1364,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1456,16 +1457,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static int kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+static __always_inline int
+kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
+ unsigned long data))
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -1521,7 +1523,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
return r;
@@ -1533,7 +1535,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
return r;
@@ -1588,7 +1590,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int young = false;
young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
return young;
@@ -1599,7 +1601,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
int young = false;
young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
return young;
@@ -1723,13 +1725,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
return 0;
}
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *pte)
-{
- WARN_ON(1);
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -2016,9 +2011,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
- if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
+ cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
flush = false;
}
}
@@ -2417,7 +2412,7 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
return 0;
restart:
- list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
/*
* Don't zap active root pages, the page itself can't be freed
* and zapping it will just force vCPUs to realloc and reload.
@@ -2470,7 +2465,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
@@ -2481,7 +2476,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2492,7 +2487,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
@@ -2500,7 +2495,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return r;
}
@@ -3161,7 +3156,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
if (kvm_mmu_put_root(kvm, sp)) {
- if (sp->tdp_mmu_page)
+ if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_free_root(kvm, sp);
else if (sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3192,7 +3187,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return;
}
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3215,7 +3210,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3236,16 +3231,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
struct kvm_mmu_page *sp;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu)) {
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return INVALID_PAGE;
}
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
@@ -3255,7 +3250,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
hpa_t root;
unsigned i;
- if (vcpu->kvm->arch.tdp_mmu_enabled) {
+ if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
if (!VALID_PAGE(root))
@@ -3416,17 +3411,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
!smp_load_acquire(&sp->unsync_children))
return;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
mmu_sync_children(vcpu, sp);
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
@@ -3440,7 +3435,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
@@ -3724,7 +3719,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
return r;
r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ read_lock(&vcpu->kvm->mmu_lock);
+ else
+ write_lock(&vcpu->kvm->mmu_lock);
+
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
@@ -3739,7 +3739,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
prefault, is_tdp);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ read_unlock(&vcpu->kvm->mmu_lock);
+ else
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return r;
}
@@ -3813,7 +3816,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = true;
@@ -3984,20 +3986,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
static void
__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, int level, bool nx, bool gbpages,
+ u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
bool pse, bool amd)
{
- u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
+ u64 high_bits_rsvd;
rsvd_check->bad_mt_xwr = 0;
- if (!nx)
- exb_bit_rsvd = rsvd_bits(63, 63);
if (!gbpages)
gbpages_bit_rsvd = rsvd_bits(7, 7);
+ if (level == PT32E_ROOT_LEVEL)
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+ else
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+ /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+ if (!nx)
+ high_bits_rsvd |= rsvd_bits(63, 63);
+
/*
* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
* leaf entries) on AMD CPUs only.
@@ -4026,45 +4035,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
break;
case PT32E_ROOT_LEVEL:
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
- rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PDE */
- rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PTE */
- rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62) |
- rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ high_bits_rsvd |
+ rsvd_bits(5, 8) |
+ rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_5LEVEL:
- rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
- rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
rsvd_check->rsvd_bits_mask[1][4] =
rsvd_check->rsvd_bits_mask[0][4];
fallthrough;
case PT64_ROOT_4LEVEL:
- rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- gbpages_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
rsvd_check->rsvd_bits_mask[1][3] =
rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 29);
- rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
@@ -4075,8 +4078,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), context->root_level,
- context->nx,
+ vcpu->arch.reserved_gpa_bits,
+ context->root_level, context->nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu),
guest_cpuid_is_amd_or_hygon(vcpu));
@@ -4084,27 +4087,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, bool execonly)
+ u64 pa_bits_rsvd, bool execonly)
{
+ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
u64 bad_mt_xwr;
- rsvd_check->rsvd_bits_mask[0][4] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][3] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
@@ -4123,7 +4121,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), execonly);
+ vcpu->arch.reserved_gpa_bits, execonly);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+ return rsvd_bits(shadow_phys_bits, 63);
}
/*
@@ -4145,7 +4148,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
*/
shadow_zero_check = &context->shadow_zero_check;
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- shadow_phys_bits,
+ reserved_hpa_bits(),
context->shadow_root_level, uses_nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu), true);
@@ -4182,14 +4185,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- shadow_phys_bits,
+ reserved_hpa_bits(),
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- shadow_phys_bits,
- false);
+ reserved_hpa_bits(), false);
if (!shadow_me_mask)
return;
@@ -4209,7 +4211,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- shadow_phys_bits, execonly);
+ reserved_hpa_bits(), execonly);
}
#define BYTE_MASK(access) \
@@ -4395,7 +4397,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->update_pte = paging64_update_pte;
context->shadow_root_level = level;
context->direct_map = false;
}
@@ -4424,7 +4425,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = false;
}
@@ -4506,7 +4506,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
context->direct_map = true;
context->get_guest_pgd = get_cr3;
@@ -4678,7 +4677,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
- context->update_pte = ept_update_pte;
context->root_level = level;
context->direct_map = false;
context->mmu_role.as_u64 = new_role.as_u64;
@@ -4811,7 +4809,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
kvm_mmu_load_pgd(vcpu);
- kvm_x86_ops.tlb_flush_current(vcpu);
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
out:
return r;
}
@@ -4826,19 +4824,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *new)
-{
- if (sp->role.level != PG_LEVEL_4K) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
-}
-
static bool need_remote_flush(u64 old, u64 new)
{
if (!is_shadow_present_pte(old))
@@ -4954,22 +4939,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-/*
- * Ignore various flags when determining if a SPTE can be immediately
- * overwritten for the current MMU.
- * - level: explicitly checked in mmu_pte_write_new_pte(), and will never
- * match the current MMU role, as MMU's level tracks the root level.
- * - access: updated based on the new guest PTE
- * - quadrant: handled by get_written_sptes()
- * - invalid: always false (loop only walks valid shadow pages)
- */
-static const union kvm_mmu_page_role role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .invalid = 0x1,
-};
-
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
struct kvm_page_track_notifier_node *node)
@@ -4999,7 +4968,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
mmu_topup_memory_caches(vcpu, true);
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5020,14 +4989,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
local_flush = true;
while (npte--) {
- u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
-
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
- if (gentry &&
- !((sp->role.word ^ base_role) & ~role_ign.word) &&
- rmap_can_add(vcpu))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
@@ -5035,7 +5000,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -5125,7 +5090,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
}
if (!mmu->invlpg)
@@ -5182,7 +5147,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (tlb_flush)
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5233,14 +5198,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (iterator.rmap)
flush |= fn(kvm, iterator.rmap);
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
flush = false;
}
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_rwlock_write(&kvm->mmu_lock);
}
}
@@ -5390,7 +5355,7 @@ restart:
* be in active use by the guest.
*/
if (batch >= BATCH_ZAP_PAGES &&
- cond_resched_lock(&kvm->mmu_lock)) {
+ cond_resched_rwlock_write(&kvm->mmu_lock)) {
batch = 0;
goto restart;
}
@@ -5423,7 +5388,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
lockdep_assert_held(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
trace_kvm_mmu_zap_all_fast(kvm);
/*
@@ -5447,10 +5412,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5492,7 +5457,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(memslot, slots) {
@@ -5510,13 +5475,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
}
- if (kvm->arch.tdp_mmu_enabled) {
+ if (is_tdp_mmu_enabled(kvm)) {
flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5531,12 +5496,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
/*
* We can flush all the TLBs out of the mmu lock without TLB
@@ -5596,13 +5561,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
kvm_mmu_zap_collapsible_spte, true);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
@@ -5625,11 +5590,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5647,12 +5612,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5664,11 +5629,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5681,23 +5646,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
LIST_HEAD(invalid_list);
int ign;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (WARN_ON(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
- if (cond_resched_lock(&kvm->mmu_lock))
+ if (cond_resched_rwlock_write(&kvm->mmu_lock))
goto restart;
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -5757,7 +5722,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
continue;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
@@ -5768,7 +5733,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
/*
@@ -5988,7 +5953,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ulong to_zap;
rcu_idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
@@ -6005,22 +5970,22 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
struct kvm_mmu_page,
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
- if (sp->tdp_mmu_page)
+ if (is_tdp_mmu_page(sp)) {
kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
- else {
+ } else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
}
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_rwlock_write(&kvm->mmu_lock);
}
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index c8d51a37e2ce..ced15fd58fde 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
}
static bool mmu_audit;
-static struct static_key mmu_audit_key;
+static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
@@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
- if (static_key_false((&mmu_audit_key)))
+ if (static_branch_unlikely((&mmu_audit_key)))
__kvm_mmu_audit(vcpu, point);
}
@@ -259,7 +259,7 @@ static void mmu_audit_enable(void)
if (mmu_audit)
return;
- static_key_slow_inc(&mmu_audit_key);
+ static_branch_inc(&mmu_audit_key);
mmu_audit = true;
}
@@ -268,7 +268,7 @@ static void mmu_audit_disable(void)
if (!mmu_audit)
return;
- static_key_slow_dec(&mmu_audit_key);
+ static_branch_dec(&mmu_audit_key);
mmu_audit = false;
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bfc6389edc28..9e38d3c5daad 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -12,7 +12,7 @@
extern bool dbg;
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
@@ -56,7 +56,12 @@ struct kvm_mmu_page {
/* Number of writes since the last time traversal visited this page. */
atomic_t write_flooding_count;
+#ifdef CONFIG_X86_64
bool tdp_mmu_page;
+
+ /* Used for freeing the page asyncronously if it is a TDP MMU page. */
+ struct rcu_head rcu_head;
+#endif
};
extern struct kmem_cache *mmu_page_header_cache;
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 8443a675715b..34bb0ec69bd8 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm,
head = &kvm->arch.track_notifier_head;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
@@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
head = &kvm->arch.track_notifier_head;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
hlist_del_rcu(&n->node);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..d9f66cc459e8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
}
r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return r;
}
@@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
level = iterator.level;
sptep = iterator.sptep;
@@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c51ad544f25b..ef55f0bc4ccf 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -120,7 +120,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+ spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
if (host_writable)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 2b3a30bd38b0..6de3950fd704 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -131,6 +131,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * This constant works because it is considered non-present on both AMD and
+ * Intel CPUs and does not create a L1TF vulnerability because the pfn section
+ * is zeroed out.
+ *
+ * Only used by the TDP MMU.
+ */
+#define REMOVED_SPTE (1ull << 59)
+
+static inline bool is_removed_spte(u64 spte)
+{
+ return spte == REMOVED_SPTE;
+}
+
+/*
* In some cases, we need to preserve the GFN of a non-present or reserved
* SPTE when we usurp the upper five bits of the physical address space to
* defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
@@ -185,23 +204,19 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline int is_shadow_present_pte(u64 pte)
+static inline bool is_shadow_present_pte(u64 pte)
{
- return (pte != 0) && !is_mmio_spte(pte);
+ return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
}
-static inline int is_large_pte(u64 pte)
+static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
}
-static inline int is_last_spte(u64 pte, int level)
+static inline bool is_last_spte(u64 pte, int level)
{
- if (level == PG_LEVEL_4K)
- return 1;
- if (is_large_pte(pte))
- return 1;
- return 0;
+ return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
static inline bool is_executable_pte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 87b7e16911db..e5f148106e20 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
}
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -22,21 +22,22 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
/*
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
- * rooted at root_pt, starting with the walk to translate goal_gfn.
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
- int min_level, gfn_t goal_gfn)
+ int min_level, gfn_t next_last_level_gfn)
{
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
- iter->goal_gfn = goal_gfn;
+ iter->next_last_level_gfn = next_last_level_gfn;
+ iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
iter->level = root_level;
- iter->pt_path[iter->level - 1] = root_pt;
+ iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
iter->valid = true;
@@ -47,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
* address of the child page table referenced by the SPTE. Returns null if
* there is no such entry.
*/
-u64 *spte_to_child_pt(u64 spte, int level)
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
{
/*
* There's no child entry if this entry isn't present or is a
@@ -56,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
return NULL;
- return __va(spte_to_pfn(spte) << PAGE_SHIFT);
+ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}
/*
@@ -65,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
*/
static bool try_step_down(struct tdp_iter *iter)
{
- u64 *child_pt;
+ tdp_ptep_t child_pt;
if (iter->level == iter->min_level)
return false;
@@ -74,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter)
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
@@ -82,7 +83,7 @@ static bool try_step_down(struct tdp_iter *iter)
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
@@ -106,9 +107,9 @@ static bool try_step_side(struct tdp_iter *iter)
return false;
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
- iter->goal_gfn = iter->gfn;
+ iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
return true;
}
@@ -158,24 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
}
-/*
- * Restart the walk over the paging structure from the root, starting from the
- * highest gfn the iterator had previously reached. Assumes that the entire
- * paging structure, except the root page, may have been completely torn down
- * and rebuilt.
- */
-void tdp_iter_refresh_walk(struct tdp_iter *iter)
-{
- gfn_t goal_gfn = iter->goal_gfn;
-
- if (iter->gfn > goal_gfn)
- goal_gfn = iter->gfn;
-
- tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
- iter->root_level, iter->min_level, goal_gfn);
-}
-
-u64 *tdp_iter_root_pt(struct tdp_iter *iter)
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
{
return iter->pt_path[iter->root_level - 1];
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 47170d0dc98e..4cc177d75c4a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -7,6 +7,8 @@
#include "mmu.h"
+typedef u64 __rcu *tdp_ptep_t;
+
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
*/
@@ -15,11 +17,17 @@ struct tdp_iter {
* The iterator will traverse the paging structure towards the mapping
* for this GFN.
*/
- gfn_t goal_gfn;
+ gfn_t next_last_level_gfn;
+ /*
+ * The next_last_level_gfn at the time when the thread last
+ * yielded. Only yielding when the next_last_level_gfn !=
+ * yielded_gfn helps ensure forward progress.
+ */
+ gfn_t yielded_gfn;
/* Pointers to the page tables traversed to reach the current SPTE */
- u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
- u64 *sptep;
+ tdp_ptep_t sptep;
/* The lowest GFN mapped by the current SPTE */
gfn_t gfn;
/* The level of the root page given to the iterator */
@@ -49,12 +57,11 @@ struct tdp_iter {
#define for_each_tdp_pte(iter, root, root_level, start, end) \
for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
-u64 *spte_to_child_pt(u64 pte, int level);
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
- int min_level, gfn_t goal_gfn);
+ int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
-void tdp_iter_refresh_walk(struct tdp_iter *iter);
-u64 *tdp_iter_root_pt(struct tdp_iter *iter);
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index b56d604809b8..71e100a5670f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -7,32 +7,23 @@
#include "tdp_mmu.h"
#include "spte.h"
+#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-#endif
-
-static bool is_tdp_mmu_enabled(void)
-{
-#ifdef CONFIG_X86_64
- return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
-#else
- return false;
-#endif /* CONFIG_X86_64 */
-}
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
- if (!is_tdp_mmu_enabled())
+ if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
return;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
@@ -42,6 +33,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
return;
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+ * Ensure that all the outstanding RCU callbacks to free shadow pages
+ * can run before the VM is torn down.
+ */
+ rcu_barrier();
}
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
@@ -53,7 +50,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
struct kvm_mmu_page *root)
{
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
return false;
@@ -88,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
#define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
-{
- struct kvm_mmu_page *sp;
-
- if (!kvm->arch.tdp_mmu_enabled)
- return false;
- if (WARN_ON(!VALID_PAGE(hpa)))
- return false;
-
- sp = to_shadow_page(hpa);
- if (WARN_ON(!sp))
- return false;
-
- return sp->tdp_mmu_page && sp->root_count;
-}
-
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield);
@@ -111,7 +92,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(root->root_count);
WARN_ON(!root->tdp_mmu_page);
@@ -164,13 +145,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root) {
if (root->role.word == role.word) {
kvm_mmu_get_root(kvm, root);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return root;
}
}
@@ -180,7 +161,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
list_add(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return root;
}
@@ -196,8 +177,31 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
return __pa(root->spt);
}
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+
+ tdp_mmu_free_sp(sp);
+}
+
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level);
+ u64 old_spte, u64 new_spte, int level,
+ bool shared);
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
@@ -235,6 +239,128 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
+ * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ * @account_nx: This page replaces a NX large page and should be marked for
+ * eventual reclaim.
+ */
+static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared, bool account_nx)
+{
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ */
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
+{
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ list_del(&sp->link);
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ * of the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ */
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+ bool shared)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(pt);
+ int level = sp->role.level;
+ gfn_t base_gfn = sp->gfn;
+ u64 old_child_spte;
+ u64 *sptep;
+ gfn_t gfn;
+ int i;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+
+ tdp_mmu_unlink_page(kvm, sp, shared);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ sptep = pt + i;
+ gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+
+ if (shared) {
+ /*
+ * Set the SPTE to a nonpresent value that other
+ * threads will not overwrite. If the SPTE was
+ * already marked as removed then another thread
+ * handling a page fault could overwrite it, so
+ * set the SPTE until it is set from some other
+ * value to the removed SPTE value.
+ */
+ for (;;) {
+ old_child_spte = xchg(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_child_spte))
+ break;
+ cpu_relax();
+ }
+ } else {
+ old_child_spte = READ_ONCE(*sptep);
+
+ /*
+ * Marking the SPTE as a removed SPTE is not
+ * strictly necessary here as the MMU lock will
+ * stop other threads from concurrently modifying
+ * this SPTE. Using the removed SPTE value keeps
+ * the two branches consistent and simplifies
+ * the function.
+ */
+ WRITE_ONCE(*sptep, REMOVED_SPTE);
+ }
+ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+ old_child_spte, REMOVED_SPTE, level - 1,
+ shared);
+ }
+
+ kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ KVM_PAGES_PER_HPAGE(level));
+
+ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
@@ -242,22 +368,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
* @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
*
* Handle bookkeeping that might result from the modification of a SPTE.
* This function must be called for all TDP SPTE modifications.
*/
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
- u64 *pt;
- struct kvm_mmu_page *sp;
- u64 old_child_spte;
- int i;
WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
@@ -298,15 +424,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE, it is
- * unexpected. Log the change, though it should not impact the
- * guest since both the former and current SPTEs are nonpresent.
+ * If this change does not involve a MMIO SPTE or removed SPTE,
+ * it is unexpected. Log the change, though it should not
+ * impact the guest since both the former and current SPTEs
+ * are nonpresent.
*/
- if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
+ if (WARN_ON(!is_mmio_spte(old_spte) &&
+ !is_mmio_spte(new_spte) &&
+ !is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
- "are MMIO SPTEs.\n"
+ "are MMIO SPTEs, or the new SPTE is\n"
+ "a temporary removed SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -321,54 +451,127 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Recursively handle child PTs if the change removed a subtree from
* the paging structure.
*/
- if (was_present && !was_leaf && (pfn_changed || !is_present)) {
- pt = spte_to_child_pt(old_spte, level);
- sp = sptep_to_sp(pt);
+ if (was_present && !was_leaf && (pfn_changed || !is_present))
+ handle_removed_tdp_mmu_page(kvm,
+ spte_to_child_pt(old_spte, level), shared);
+}
- trace_kvm_mmu_prepare_zap_page(sp);
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
+{
+ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+ shared);
+ handle_changed_spte_acc_track(old_spte, new_spte, level);
+ handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+ new_spte, level);
+}
- list_del(&sp->link);
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ * this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ u64 *root_pt = tdp_iter_root_pt(iter);
+ struct kvm_mmu_page *root = sptep_to_sp(root_pt);
+ int as_id = kvm_mmu_page_as_id(root);
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ lockdep_assert_held_read(&kvm->mmu_lock);
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- old_child_spte = READ_ONCE(*(pt + i));
- WRITE_ONCE(*(pt + i), 0);
- handle_changed_spte(kvm, as_id,
- gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
- old_child_spte, 0, level - 1);
- }
+ /*
+ * Do not change removed SPTEs. Only the thread that froze the SPTE
+ * may modify it.
+ */
+ if (iter->old_spte == REMOVED_SPTE)
+ return false;
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+ new_spte) != iter->old_spte)
+ return false;
- free_page((unsigned long)pt);
- kmem_cache_free(mmu_page_header_cache, sp);
- }
+ handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
+ iter->level, true);
+
+ return true;
}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
+static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ /*
+ * Freeze the SPTE by setting it to a special,
+ * non-present value. This will stop other threads from
+ * immediately installing a present entry in its place
+ * before the TLBs are flushed.
+ */
+ if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+ return false;
+
+ kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
+ KVM_PAGES_PER_HPAGE(iter->level));
+
+ /*
+ * No other thread can overwrite the removed SPTE as they
+ * must either wait on the MMU lock or use
+ * tdp_mmu_set_spte_atomic which will not overrite the
+ * special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present
+ * to non-present.
+ */
+ WRITE_ONCE(*iter->sptep, 0);
+
+ return true;
}
+
+/*
+ * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * @record_acc_track: Notify the MM subsystem of changes to the accessed state
+ * of the page. Should be set unless handling an MMU
+ * notifier for access tracking. Leaving record_acc_track
+ * unset in that case prevents page accesses from being
+ * double counted.
+ * @record_dirty_log: Record the page as dirty in the dirty bitmap if
+ * appropriate for the change being made. Should be set
+ * unless performing certain dirty logging operations.
+ * Leaving record_dirty_log unset in that case prevents page
+ * writes from being double counted.
+ */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
- u64 *root_pt = tdp_iter_root_pt(iter);
+ tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
struct kvm_mmu_page *root = sptep_to_sp(root_pt);
int as_id = kvm_mmu_page_as_id(root);
- WRITE_ONCE(*iter->sptep, new_spte);
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * No thread should be using this function to set SPTEs to the
+ * temporary removed SPTE value.
+ * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+ * should be used. If operating under the MMU lock in write mode, the
+ * use of the removed SPTE should not be necessary.
+ */
+ WARN_ON(iter->old_spte == REMOVED_SPTE);
+
+ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
__handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level);
+ iter->level, false);
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
iter->level);
@@ -413,27 +616,46 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
_mmu->shadow_root_level, _start, _end)
/*
- * Flush the TLB if the process should drop kvm->mmu_lock.
- * Return whether the caller still needs to flush the tlb.
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, it will also reset the tdp_iter's walk over the
+ * paging structure and the calling function should skip to the next
+ * iteration to allow the iterator to continue its traversal from the
+ * paging structure root.
+ *
+ * Return true if this function yielded and the iterator's traversal was reset.
+ * Return false if a yield was not needed.
*/
-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
+static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter, bool flush)
{
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_flush_remote_tlbs(kvm);
- cond_resched_lock(&kvm->mmu_lock);
- tdp_iter_refresh_walk(iter);
+ /* Ensure forward progress has been made before yielding. */
+ if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
- } else {
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ rcu_read_unlock();
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ rcu_read_lock();
+
+ WARN_ON(iter->gfn > iter->next_last_level_gfn);
+
+ tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
+ iter->root_level, iter->min_level,
+ iter->next_last_level_gfn);
+
return true;
}
-}
-static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
-{
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- cond_resched_lock(&kvm->mmu_lock);
- tdp_iter_refresh_walk(iter);
- }
+ return false;
}
/*
@@ -453,7 +675,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
bool flush_needed = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
+ flush_needed = false;
+ continue;
+ }
+
if (!is_shadow_present_pte(iter.old_spte))
continue;
@@ -468,12 +698,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
-
- if (can_yield)
- flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
- else
- flush_needed = true;
+ flush_needed = true;
}
+
+ rcu_read_unlock();
return flush_needed;
}
@@ -517,21 +745,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
int ret = 0;
int make_spte_ret = 0;
- if (unlikely(is_noslot_pfn(pfn))) {
+ if (unlikely(is_noslot_pfn(pfn)))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
- trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
- } else {
+ else
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
&new_spte);
- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
- }
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else
- tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+ else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ return RET_PF_RETRY;
/*
* If the page fault was caused by a write but the page is write
@@ -545,10 +770,16 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte)))
+ if (unlikely(is_mmio_spte(new_spte))) {
+ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+ new_spte);
ret = RET_PF_EMULATE;
+ } else
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
if (!prefault)
vcpu->stat.pf_fixed++;
@@ -586,6 +817,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
+
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(iter.old_spte, gfn,
@@ -601,49 +835,61 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
- KVM_PAGES_PER_HPAGE(iter.level));
+ if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ break;
/*
* The iter must explicitly re-read the spte here
* because the new value informs the !present
* path below.
*/
- iter.old_spte = READ_ONCE(*iter.sptep);
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
}
if (!is_shadow_present_pte(iter.old_spte)) {
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
- list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
child_pt = sp->spt;
- clear_page(child_pt);
+
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- trace_kvm_mmu_get_page(sp, true);
- if (huge_page_disallowed && req_level >= iter.level)
- account_huge_nx_page(vcpu->kvm, sp);
-
- tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+ if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+ new_spte)) {
+ tdp_mmu_link_page(vcpu->kvm, sp, true,
+ huge_page_disallowed &&
+ req_level >= iter.level);
+
+ trace_kvm_mmu_get_page(sp, true);
+ } else {
+ tdp_mmu_free_sp(sp);
+ break;
+ }
}
}
- if (WARN_ON(iter.level != level))
+ if (iter.level != level) {
+ rcu_read_unlock();
return RET_PF_RETRY;
+ }
ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
pfn, prefault);
+ rcu_read_unlock();
return ret;
}
-static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end, unsigned long data,
- int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long data))
+static __always_inline int
+kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_mmu_page *root,
+ gfn_t start,
+ gfn_t end,
+ unsigned long data))
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -705,6 +951,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
int young = 0;
u64 new_spte = 0;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
/*
* If we have a non-accessed entry we don't need to change the
@@ -736,6 +984,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
trace_kvm_age_page(iter.gfn, iter.level, slot, young);
}
+ rcu_read_unlock();
+
return young;
}
@@ -781,6 +1031,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
u64 new_spte;
int need_flush = 0;
+ rcu_read_lock();
+
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
@@ -809,6 +1061,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
if (need_flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ rcu_read_unlock();
+
return 0;
}
@@ -832,21 +1086,27 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
if (!is_shadow_present_pte(iter.old_spte) ||
- !is_last_spte(iter.old_spte, iter.level))
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !(iter.old_spte & PT_WRITABLE_MASK))
continue;
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+
+ rcu_read_unlock();
return spte_set;
}
@@ -888,7 +1148,12 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -903,9 +1168,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+
+ rcu_read_unlock();
return spte_set;
}
@@ -947,6 +1212,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
u64 new_spte;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
@@ -956,6 +1223,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
!(mask & (1UL << (iter.gfn - gfn))))
continue;
+ mask &= ~(1UL << (iter.gfn - gfn));
+
if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -969,9 +1238,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
}
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
-
- mask &= ~(1UL << (iter.gfn - gfn));
}
+
+ rcu_read_unlock();
}
/*
@@ -989,7 +1258,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_mmu_page *root;
int root_as_id;
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
@@ -1011,18 +1280,23 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
- if (!is_shadow_present_pte(iter.old_spte))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ iter.old_spte & shadow_dirty_mask)
continue;
new_spte = iter.old_spte | shadow_dirty_mask;
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+ rcu_read_unlock();
return spte_set;
}
@@ -1060,7 +1334,14 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
kvm_pfn_t pfn;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
+ spte_set = false;
+ continue;
+ }
+
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -1072,9 +1353,10 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
tdp_mmu_set_spte(kvm, &iter, 0);
- spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
+ spte_set = true;
}
+ rcu_read_unlock();
if (spte_set)
kvm_flush_remote_tlbs(kvm);
}
@@ -1111,6 +1393,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
if (!is_writable_pte(iter.old_spte))
break;
@@ -1122,6 +1406,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
spte_set = true;
}
+ rcu_read_unlock();
+
return spte_set;
}
@@ -1137,7 +1423,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int root_as_id;
bool spte_set = false;
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
@@ -1162,10 +1448,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->shadow_root_level;
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
+ rcu_read_unlock();
+
return leaf;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index cbbdbadd1526..b4b65e3699b3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -5,10 +5,6 @@
#include <linux/kvm_host.h>
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
@@ -47,4 +43,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
+#ifdef CONFIG_X86_64
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!is_tdp_mmu_enabled(kvm))
+ return false;
+ if (WARN_ON(!VALID_PAGE(hpa)))
+ return false;
+
+ sp = to_shadow_page(hpa);
+ if (WARN_ON(!sp))
+ return false;
+
+ return is_tdp_mmu_page(sp) && sp->root_count;
+}
+
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index f472fdb6ae7e..a8502e02f479 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* variable MTRRs */
WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
- mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
/* MTRR base */
if (!valid_mtrr_type(data & 0xff))
@@ -351,14 +351,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
- /* Extend the mask with all 1 bits to the left, since those
- * bits must implicitly be 0. The bits are then cleared
- * when reading them.
+ /*
+ * Set all illegal GPA bits in the mask, since those bits must
+ * implicitly be 0. The bits are then cleared when reading them.
*/
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
+ cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
@@ -426,7 +426,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
- *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
+ *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
}
return 0;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 67741d2a0308..827886c12c16 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -373,7 +373,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
return 1;
if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
- (kvm_x86_ops.get_cpl(vcpu) != 0) &&
+ (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
(kvm_read_cr0(vcpu) & X86_CR0_PE))
return 1;
@@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- if (lapic_in_kernel(vcpu))
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_x86_ops.pmu_ops->deliver_pmi)
+ kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ }
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -473,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
}
+ if (kvm_x86_ops.pmu_ops->cleanup)
+ kvm_x86_ops.pmu_ops->cleanup(vcpu);
+
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 067fef51760c..7b30bc967af3 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
void (*refresh)(struct kvm_vcpu *vcpu);
void (*init)(struct kvm_vcpu *vcpu);
void (*reset)(struct kvm_vcpu *vcpu);
+ void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+ void (*cleanup)(struct kvm_vcpu *vcpu);
};
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 0ef84d57b72e..78bdcfac4e40 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -298,6 +298,23 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, source,
+ icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
{
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
@@ -324,28 +341,14 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
- case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm *kvm = svm->vcpu.kvm;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
/*
* At this point, we expect that the AVIC HW has already
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
- }
+ avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
break;
- }
case AVIC_IPI_FAILURE_INVALID_TARGET:
WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
index, svm->vcpu.vcpu_id, icrh, icrl);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index db30670dd8c4..cc91738ab445 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -58,7 +58,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
@@ -248,7 +248,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
if (vmcb12_lma) {
if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
!(vmcb12->save.cr0 & X86_CR0_PE) ||
- (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+ kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
return false;
}
if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
@@ -345,7 +345,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt)
{
- if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
+ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return -EINVAL;
if (!nested_npt && is_pae_paging(vcpu) &&
@@ -392,7 +392,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
svm->vmcb->save.cpl = vmcb12->save.cpl;
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..874ea309279f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -22,6 +22,7 @@
#include "x86.h"
#include "svm.h"
+#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"
@@ -1041,6 +1042,74 @@ e_unpin_memory:
return ret;
}
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_attestation_report *data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1091,6 +1160,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_LAUNCH_SECRET:
r = sev_launch_secret(kvm, &sev_cmd);
break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -1994,29 +2066,17 @@ void sev_es_create_vcpu(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
struct vmcb_save_area *hostsa;
- unsigned int i;
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
* of which one step is to perform a VMLOAD. Since hardware does not
* perform a VMSAVE on VMRUN, the host savearea must be updated.
*/
- asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory");
-
- /*
- * Certain MSRs are restored on VMEXIT, only save ones that aren't
- * restored.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
- if (host_save_user_msrs[i].sev_es_restored)
- continue;
-
- rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
- }
+ vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
@@ -2029,22 +2089,6 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
hostsa->xss = host_xss;
}
-void sev_es_vcpu_put(struct vcpu_svm *svm)
-{
- unsigned int i;
-
- /*
- * Certain MSRs are restored on VMEXIT and were saved with vmsave in
- * sev_es_vcpu_load() above. Only restore ones that weren't.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
- if (host_save_user_msrs[i].sev_es_restored)
- continue;
-
- wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
- }
-}
-
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
struct vcpu_svm *svm = to_svm(vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3442d44ca53b..adb3619a3c16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "svm.h"
+#include "svm_ops.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -200,9 +201,9 @@ module_param(sev_es, int, 0444);
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
-static u8 rsm_ins_bytes[] = "\x0f\xaa";
+static bool svm_gp_erratum_intercept = true;
-static void svm_complete_interrupts(struct vcpu_svm *svm);
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
static unsigned long iopm_base;
@@ -246,21 +247,6 @@ u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static inline void clgi(void)
-{
- asm volatile (__ex("clgi"));
-}
-
-static inline void stgi(void)
-{
- asm volatile (__ex("stgi"));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
-}
-
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -288,6 +274,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
/*
* Free the nested guest state, unless we are in SMM.
@@ -304,6 +293,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.efer = old_efer;
return ret;
}
+
+ if (svm_gp_erratum_intercept)
+ set_exception_intercept(svm, GP_VECTOR);
}
}
@@ -925,6 +917,9 @@ static __init void svm_set_cpu_caps(void)
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
@@ -1032,6 +1027,9 @@ static __init int svm_hardware_setup(void)
}
}
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
@@ -1207,7 +1205,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, 2);
+ kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1366,6 +1364,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->vmsa = page_address(vmsa_page);
svm->asid_generation = 0;
+ svm->guest_state_loaded = false;
init_vmcb(svm);
svm_init_osvw(vcpu);
@@ -1413,30 +1412,31 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
}
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- int i;
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ unsigned int i;
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
+ if (svm->guest_state_loaded)
+ return;
+ /*
+ * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
+ * area (non-sev-es). Save ones that aren't so we can restore them
+ * individually later.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+ * or subsequent vmload of host save area.
+ */
if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_vcpu_load(svm, cpu);
+ sev_es_prepare_guest_switch(svm, vcpu->cpu);
} else {
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i].index,
- svm->host_user_msrs[i]);
+ vmsave(__sme_page_pa(sd->save_area));
}
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
@@ -1446,10 +1446,42 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
}
}
+
/* This assumes that the kernel never uses MSR_TSC_AUX */
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned int i;
+
+ if (!svm->guest_state_loaded)
+ return;
+
+ /*
+ * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
+ * area (non-sev-es). Restore the ones that weren't.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ svm->guest_state_loaded = false;
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+ if (unlikely(cpu != vcpu->cpu)) {
+ svm->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ }
+
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
@@ -1459,30 +1491,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
avic_vcpu_put(vcpu);
+ svm_prepare_host_switch(vcpu);
++vcpu->stat.host_state_reload;
- if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_vcpu_put(svm);
- } else {
- kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
- load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i].index,
- svm->host_user_msrs[i]);
- }
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1815,7 +1827,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1865,7 +1877,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
get_debugreg(vcpu->arch.db[2], 2);
get_debugreg(vcpu->arch.db[3], 3);
/*
- * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
* because db_interception might need it. We can do it before vmentry.
*/
vcpu->arch.dr6 = svm->vmcb->save.dr6;
@@ -1916,7 +1928,7 @@ static int db_interception(struct vcpu_svm *svm)
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
- u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
return 1;
}
@@ -1962,24 +1974,6 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
}
-static int gp_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- u32 error_code = svm->vmcb->control.exit_info_1;
-
- WARN_ON_ONCE(!enable_vmware_backdoor);
-
- /*
- * VMware backdoor emulation on #GP interception only handles IN{S},
- * OUT{S}, and RDPMC, none of which generate a non-zero error code.
- */
- if (error_code) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
- }
- return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
-}
-
static bool is_erratum_383(void)
{
int err, i;
@@ -2178,6 +2172,102 @@ static int vmrun_interception(struct vcpu_svm *svm)
return nested_svm_vmrun(svm);
}
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ return nested_svm_vmexit(svm);
+ } else
+ return svm_instr_handlers[opcode](svm);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following two
+ * cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
+ * some AMD CPUs when EAX of these instructions are in the reserved memory
+ * regions (e.g. SMM memory on host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else
+ return emulate_svm_instr(vcpu, opcode);
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
if (value) {
@@ -2265,11 +2355,8 @@ static int xsetbv_interception(struct vcpu_svm *svm)
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
u32 index = kvm_rcx_read(&svm->vcpu);
- if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
-
- return 1;
+ int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int rdpru_interception(struct vcpu_svm *svm)
@@ -2530,6 +2617,7 @@ static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
unsigned long val;
+ int err = 0;
if (svm->vcpu.guest_debug == 0) {
/*
@@ -2547,20 +2635,16 @@ static int dr_interception(struct vcpu_svm *svm)
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
- if (dr >= 16) { /* mov to DRn */
- if (!kvm_require_dr(&svm->vcpu, dr - 16))
- return 1;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
val = kvm_register_read(&svm->vcpu, reg);
- kvm_set_dr(&svm->vcpu, dr - 16, val);
+ err = kvm_set_dr(&svm->vcpu, dr, val);
} else {
- if (!kvm_require_dr(&svm->vcpu, dr))
- return 1;
kvm_get_dr(&svm->vcpu, dr, &val);
kvm_register_write(&svm->vcpu, reg, val);
}
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int cr8_write_interception(struct vcpu_svm *svm)
@@ -3354,7 +3438,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3479,7 +3563,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !svm_interrupt_blocked(vcpu);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3503,7 +3587,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
}
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3560,10 +3644,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
invlpga(gva, svm->vmcb->control.asid);
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3708,16 +3788,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (sev_es_guest(svm->vcpu.kvm)) {
__svm_sev_es_vcpu_run(svm->vmcb_pa);
} else {
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+
__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
-#ifdef CONFIG_X86_64
- native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
+ vmload(__sme_page_pa(sd->save_area));
}
/*
@@ -3783,7 +3858,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
svm_set_dr6(svm, vcpu->arch.dr6);
else
- svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);
+ svm_set_dr6(svm, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -3978,7 +4053,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
if (best)
- vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
if (!kvm_vcpu_apicv_active(vcpu))
@@ -4285,7 +4360,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return ret;
}
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4439,7 +4514,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
- .update_exception_bitmap = update_exception_bitmap,
+ .update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
@@ -4482,9 +4557,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
.set_nmi_mask = svm_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
@@ -4527,7 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.smi_allowed = svm_smi_allowed,
.pre_enter_smm = svm_pre_enter_smm,
.pre_leave_smm = svm_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .enable_smi_window = svm_enable_smi_window,
.mem_enc_op = svm_mem_enc_op,
.mem_enc_reg_region = svm_register_enc_region,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6e7d070f8b86..39e071fdab0c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -23,22 +23,8 @@
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
-static const struct svm_host_save_msrs {
- u32 index; /* Index of the MSR */
- bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */
-} host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- { .index = MSR_STAR, .sev_es_restored = true },
- { .index = MSR_LSTAR, .sev_es_restored = true },
- { .index = MSR_CSTAR, .sev_es_restored = true },
- { .index = MSR_SYSCALL_MASK, .sev_es_restored = true },
- { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true },
- { .index = MSR_FS_BASE, .sev_es_restored = true },
-#endif
- { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true },
- { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true },
- { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true },
- { .index = MSR_TSC_AUX, .sev_es_restored = false },
+static const u32 host_save_user_msrs[] = {
+ MSR_TSC_AUX,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
@@ -130,12 +116,6 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- struct {
- u16 fs;
- u16 gs;
- u16 ldt;
- u64 gs_base;
- } host;
u64 spec_ctrl;
/*
@@ -192,6 +172,8 @@ struct vcpu_svm {
u64 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
+
+ bool guest_state_loaded;
};
struct svm_cpu_data {
@@ -587,9 +569,8 @@ int sev_handle_vmgexit(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu);
-void sev_es_vcpu_put(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644
index 000000000000..8170f2a5a16f
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include <asm/kvm_host.h>
+
+#define svm_asm(insn, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ ::: clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1, op2 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static inline void clgi(void)
+{
+ svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+ svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static inline void vmsave(unsigned long pa)
+{
+ svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+static inline void vmload(unsigned long pa)
+{
+ svm_asm1(vmload, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2de30c20bc26..a61c015870e3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -93,6 +93,42 @@ TRACE_EVENT(kvm_hv_hypercall,
);
/*
+ * Tracepoint for Xen hypercall.
+ */
+TRACE_EVENT(kvm_xen_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3, unsigned long a4,
+ unsigned long a5),
+ TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr)
+ __field(unsigned long, a0)
+ __field(unsigned long, a1)
+ __field(unsigned long, a2)
+ __field(unsigned long, a3)
+ __field(unsigned long, a4)
+ __field(unsigned long, a5)
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ __entry->a4 = a4;
+ __entry->a4 = a5;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3, __entry->a4, __entry->a5)
+);
+
+
+
+/*
* Tracepoint for PIO.
*/
@@ -256,7 +292,7 @@ TRACE_EVENT(name, \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \
+ static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
@@ -738,7 +774,7 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
__entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
- vcpu->arch.emulate_ctxt->fetch.data;
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 3a1861403d73..d1d77985e889 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -19,6 +19,9 @@ extern int __read_mostly pt_mode;
#define PT_MODE_HOST_GUEST 1
#define PMU_CAP_FW_WRITES (1ULL << 13)
+#define PMU_CAP_LBR_FMT 0x3f
+
+#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
struct nested_vmx_msrs {
/*
@@ -262,6 +265,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
SECONDARY_EXEC_TSC_SCALING;
}
+static inline bool cpu_has_vmx_bus_lock_detection(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
+}
+
static inline bool cpu_has_vmx_apicv(void)
{
return cpu_has_vmx_apic_register_virt() &&
@@ -371,11 +380,28 @@ static inline bool vmx_pt_mode_is_host_guest(void)
static inline u64 vmx_get_perf_capabilities(void)
{
+ u64 perf_cap = 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+ perf_cap &= PMU_CAP_LBR_FMT;
+
/*
* Since counters are virtualized, KVM would support full
* width counting unconditionally, even if the host lacks it.
*/
- return PMU_CAP_FW_WRITES;
+ return PMU_CAP_FW_WRITES | perf_cap;
+}
+
+static inline u64 vmx_supported_debugctl(void)
+{
+ u64 debugctl = 0;
+
+ if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
+ debugctl |= DEBUGCTLMSR_LBR_MASK;
+
+ return debugctl;
}
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f2b9bfb58206..b2f0b5e9cd63 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "nested.h"
#include "pmu.h"
#include "trace.h"
+#include "vmx.h"
#include "x86.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -411,8 +412,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
if (nr == DB_VECTOR) {
if (!has_payload) {
payload = vcpu->arch.dr6;
- payload &= ~(DR6_FIXED_1 | DR6_BT);
- payload ^= DR6_RTM;
+ payload &= ~DR6_BT;
+ payload ^= DR6_ACTIVE_LOW;
}
*exit_qual = payload;
} else
@@ -744,8 +745,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
(CC(!nested_cpu_has_vid(vmcs12)) ||
CC(!nested_exit_intr_ack_set(vcpu)) ||
CC((vmcs12->posted_intr_nv & 0xff00)) ||
- CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
- CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
+ CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
@@ -758,13 +758,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
u32 count, u64 addr)
{
- int maxphyaddr;
-
if (count == 0)
return 0;
- maxphyaddr = cpuid_maxphyaddr(vcpu);
- if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
- (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
+
+ if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
return -EINVAL;
return 0;
@@ -1062,14 +1060,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
}
}
-static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
- unsigned long invalid_mask;
-
- invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
- return (val & invalid_mask) == 0;
-}
-
/*
* Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
* tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
@@ -1121,7 +1111,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
enum vm_entry_failure_code *entry_failure_code)
{
- if (CC(!nested_cr3_valid(vcpu, cr3))) {
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -2532,7 +2522,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
*/
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
@@ -2635,7 +2625,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
/* Check for memory type validity */
switch (new_eptp & VMX_EPTP_MT_MASK) {
@@ -2666,7 +2655,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
}
/* Reserved bits should not be set */
- if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
@@ -2850,7 +2839,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
- CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
return -EINVAL;
if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
@@ -3057,35 +3046,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- asm(
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "je 1f \n\t"
- __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
- "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Check if vmlaunch or vmresume is needed */
- "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
-
- /*
- * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
- * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
- * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
- * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
- */
- "call vmx_vmenter\n\t"
-
- CC_SET(be)
- : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
- : [HOST_RSP]"r"((unsigned long)HOST_RSP),
- [loaded_vmcs]"r"(vmx->loaded_vmcs),
- [launched]"i"(offsetof(struct loaded_vmcs, launched)),
- [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
- [wordsize]"i"(sizeof(ulong))
- : "memory"
- );
+ vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -3330,7 +3292,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
enum vm_entry_failure_code entry_failure_code;
bool evaluate_pending_interrupts;
- u32 exit_reason, failed_index;
+ union vmx_exit_reason exit_reason = {
+ .basic = EXIT_REASON_INVALID_STATE,
+ .failed_vmentry = 1,
+ };
+ u32 failed_index;
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
@@ -3382,7 +3348,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (nested_vmx_check_guest_state(vcpu, vmcs12,
&entry_failure_code)) {
- exit_reason = EXIT_REASON_INVALID_STATE;
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit;
}
@@ -3393,7 +3359,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
- exit_reason = EXIT_REASON_INVALID_STATE;
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit_guest_mode;
}
@@ -3403,7 +3369,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
if (failed_index) {
- exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+ exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
vmcs12->exit_qualification = failed_index;
goto vmentry_fail_vmexit_guest_mode;
}
@@ -3471,7 +3437,7 @@ vmentry_fail_vmexit:
return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
- vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->vm_exit_reason = exit_reason.full;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
@@ -5559,7 +5525,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
fail:
- nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ /*
+ * This is effectively a reflected VM-Exit, as opposed to a synthesized
+ * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
+ * EXIT_REASON_VMFUNC as the exit reason.
+ */
+ nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
vmx_get_intr_info(vcpu),
vmx_get_exit_qual(vcpu));
return 1;
@@ -5627,7 +5598,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
*/
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12, u32 exit_reason)
+ struct vmcs12 *vmcs12,
+ union vmx_exit_reason exit_reason)
{
u32 msr_index = kvm_rcx_read(vcpu);
gpa_t bitmap;
@@ -5641,7 +5613,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
* First we need to figure out which of the four to use:
*/
bitmap = vmcs12->msr_bitmap;
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
bitmap += 2048;
if (msr_index >= 0xc0000000) {
msr_index -= 0xc0000000;
@@ -5778,11 +5750,12 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
* Return true if L0 wants to handle an exit from L2 regardless of whether or not
* L1 wants the exit. Only call this when in is_guest_mode (L2).
*/
-static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
{
u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5838,12 +5811,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
* Return 1 if L1 wants to intercept an exit from L2. Only call this when in
* is_guest_mode (L2).
*/
-static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5962,7 +5936,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
unsigned long exit_qual;
u32 exit_intr_info;
@@ -5981,7 +5955,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -6007,7 +5981,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
exit_qual = vmx_get_exit_qual(vcpu);
reflect_vmexit:
- nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
+ nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
return true;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index cdf5f34518f4..d1df618cb7de 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return &counters[array_index_nospec(idx, num_counters)];
}
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- return false;
+ return 0;
- return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+ return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
}
static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
@@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+ /*
+ * As a first step, a guest could only enable LBR feature if its
+ * cpu model is the same as the host because the LBR registers
+ * would be pass-through to the guest and they're model specific.
+ */
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+
+ return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT);
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+ struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+ bool ret = false;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return ret;
+
+ ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+ (index >= records->from && index < records->from + records->nr) ||
+ (index >= records->to && index < records->to + records->nr);
+
+ if (!ret && records->info)
+ ret = (index >= records->info && index < records->info + records->nr);
+
+ return ret;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+ intel_pmu_is_valid_lbr_msr(vcpu, msr);
break;
}
@@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->event) {
+ perf_event_release_kernel(lbr_desc->event);
+ lbr_desc->event = NULL;
+ vcpu_to_pmu(vcpu)->event_count--;
+ }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event;
+
+ /*
+ * The perf_event_attr is constructed in the minimum efficient way:
+ * - set 'pinned = true' to make it task pinned so that if another
+ * cpu pinned event reclaims LBR, the event->oncpu will be set to -1;
+ * - set '.exclude_host = true' to record guest branches behavior;
+ *
+ * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicates host perf
+ * schedule the event without a real HW counter but a fake one;
+ * check is_guest_lbr_event() and __intel_get_event_constraints();
+ *
+ * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+ * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack
+ * event, which helps KVM to save/restore guest LBR records
+ * during host context switches and reduces quite a lot overhead,
+ * check branch_user_callstack() and intel_pmu_lbr_sched_task();
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .config = INTEL_FIXED_VLBR_EVENT,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .pinned = true,
+ .exclude_host = true,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
+ if (unlikely(lbr_desc->event)) {
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1,
+ current, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("%s: failed %ld\n",
+ __func__, PTR_ERR(event));
+ return -ENOENT;
+ }
+ lbr_desc->event = event;
+ pmu->event_count++;
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+}
+
+/*
+ * It's safe to access LBR msrs from guest when they have not
+ * been passthrough since the host would help restore or reset
+ * the LBR msrs records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info, bool read)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ u32 index = msr_info->index;
+
+ if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+ return false;
+
+ if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu))
+ goto dummy;
+
+ /*
+ * Disable irq to ensure the LBR feature doesn't get reclaimed by the
+ * host at the time the value is read from the msr, and this avoids the
+ * host LBR value to be leaked to the guest. If LBR has been reclaimed,
+ * return 0 on guest reads.
+ */
+ local_irq_disable();
+ if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (read)
+ rdmsrl(index, msr_info->data);
+ else
+ wrmsrl(index, msr_info->data);
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+ return true;
+ }
+ clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+
+dummy:
+ if (read)
+ msr_info->data = 0;
+ return true;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
return 0;
- }
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
+ return 0;
}
return 1;
@@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reprogram_gp_counter(pmc, data);
return 0;
}
- }
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
+ return 0;
}
return 1;
@@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
@@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
- vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -340,8 +489,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
perf_get_x86_pmu_capability(&x86_pmu);
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
@@ -385,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+
+ if (intel_pmu_lbr_is_compatible(vcpu))
+ x86_perf_get_lbr(&lbr_desc->records);
+ else
+ lbr_desc->records.nr = 0;
+
+ if (lbr_desc->records.nr)
+ bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
@@ -405,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
pmu->fixed_counters[i].current_config = 0;
}
+
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
+ lbr_desc->records.nr = 0;
+ lbr_desc->event = NULL;
+ lbr_desc->msr_passthrough = false;
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -429,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
pmu->global_ovf_ctrl = 0;
+
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and
+ * the KVM emulates to clear the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * Guest needs to re-enable LBR to resume branches recording.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+ u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ }
+}
+
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ u8 version = vcpu_to_pmu(vcpu)->version;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return;
+
+ if (version > 1 && version < 4)
+ intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+ int i;
+
+ for (i = 0; i < lbr->nr; i++) {
+ vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+ if (lbr->info)
+ vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+ }
+
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, true);
+ lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, false);
+ lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) could reclaim the
+ * pmu resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via ipi calls (more details in perf_install_in_context).
+ *
+ * Before entering the non-root mode (with irq disabled here), double
+ * confirm that the pmu features enabled to the guest are not reclaimed
+ * by higher priority host events. Otherwise, disallow vcpu's access to
+ * the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+ return;
+ }
+
+ if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ goto warn;
+ } else
+ vmx_enable_lbr_msrs_passthrough(vcpu);
+
+ return;
+
+warn:
+ pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
+ vcpu->vcpu_id);
+}
+
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
}
struct kvm_pmu_ops intel_pmu_ops = {
@@ -445,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = {
.refresh = intel_pmu_refresh,
.init = intel_pmu_init,
.reset = intel_pmu_reset,
+ .deliver_pmi = intel_pmu_deliver_pmi,
+ .cleanup = intel_pmu_cleanup,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index f02962dcc72c..4831bc44ce66 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
dest = cpu_physical_id(cpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
@@ -104,7 +104,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
dest = cpu_physical_id(vcpu->cpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
@@ -174,7 +174,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
*/
dest = cpu_physical_id(vcpu->pre_pcpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index e85aa5faa22d..3a6461694fc2 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -44,7 +44,7 @@
* they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
* to vmx_vmexit.
*/
-SYM_FUNC_START(vmx_vmenter)
+SYM_FUNC_START_LOCAL(vmx_vmenter)
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index eb69fef57485..e0a3a9be654b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -50,6 +50,7 @@
#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
+#include "hyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
@@ -552,7 +553,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
- &vcpu->kvm->arch.hyperv.hv_pa_pg;
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
/*
* Synthetic VM-Exit is not enabled in current code and so All
* evmcs in singe VM shares same assist page.
@@ -658,6 +659,14 @@ static bool is_valid_passthrough_msr(u32 msr)
case MSR_IA32_RTIT_CR3_MATCH:
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ case MSR_LBR_SELECT:
+ case MSR_LBR_TOS:
+ case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
+ case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
+ case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
+ case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
+ case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
+ /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
return true;
}
@@ -806,7 +815,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
return *p;
}
-void update_exception_bitmap(struct kvm_vcpu *vcpu)
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
@@ -1102,7 +1111,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
/* The base must be 128-byte aligned and a legal physical address. */
- return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
@@ -1577,7 +1586,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
#ifdef CONFIG_X86_64
@@ -1924,6 +1933,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
goto find_uret_msr;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ break;
default:
find_uret_msr:
msr = vmx_find_uret_msr(vmx, msr_info->index);
@@ -1947,6 +1959,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
+static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 debugctl = vmx_supported_debugctl();
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+
+ return debugctl;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -1997,14 +2019,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_DEBUGCTLMSR: {
+ u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
+ if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ }
+
+ if (invalid)
+ return 1;
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
-
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+ }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -2196,6 +2233,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data >> 32) != 0)
return 1;
goto find_uret_msr;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (data && !vcpu_to_pmu(vcpu)->version)
+ return 1;
+ if (data & PMU_CAP_LBR_FMT) {
+ if ((data & PMU_CAP_LBR_FMT) !=
+ (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ return 1;
+ if (!intel_pmu_lbr_is_compatible(vcpu))
+ return 1;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
default:
find_uret_msr:
@@ -2265,7 +2314,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
u64 msr;
cr4_set_bits(X86_CR4_VMXE);
- intel_pt_handle_vmx(1);
asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
@@ -2276,7 +2324,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
fault:
WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- intel_pt_handle_vmx(0);
cr4_clear_bits(X86_CR4_VMXE);
return -EFAULT;
@@ -2299,9 +2346,13 @@ static int hardware_enable(void)
!hv_get_vp_assist_page(cpu))
return -EFAULT;
+ intel_pt_handle_vmx(1);
+
r = kvm_cpu_vmxon(phys_addr);
- if (r)
+ if (r) {
+ intel_pt_handle_vmx(0);
return r;
+ }
if (enable_ept)
ept_sync_global();
@@ -2319,22 +2370,14 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
- asm volatile (__ex("vmxoff"));
-
- intel_pt_handle_vmx(0);
- cr4_clear_bits(X86_CR4_VMXE);
-}
-
static void hardware_disable(void)
{
vmclear_local_loaded_vmcss();
- kvm_cpu_vmxoff();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ intel_pt_handle_vmx(0);
}
/*
@@ -2428,7 +2471,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC;
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
if (cpu_has_sgx())
opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
@@ -2739,7 +2783,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
@@ -2819,7 +2863,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
@@ -3774,7 +3818,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type, bool value)
{
if (value)
@@ -4269,6 +4313,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
ENABLE_USR_WAIT_PAUSE, false);
+ if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+ exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+
vmx->secondary_exec_control = exec_control;
}
@@ -4467,23 +4514,23 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
if (init_event)
vmx_clear_hlt(vcpu);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
- enable_irq_window(vcpu);
+ vmx_enable_irq_window(vcpu);
return;
}
@@ -4824,7 +4871,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
}
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
fallthrough;
case BP_VECTOR:
@@ -5049,6 +5096,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int dr, dr7, reg;
+ int err = 1;
exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
@@ -5057,9 +5105,9 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- /* Do not handle if the CPL > 0, will trigger GP on re-entry */
- if (!kvm_require_cpl(vcpu, 0))
- return 1;
+ if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ goto out;
+
dr7 = vmcs_readl(GUEST_DR7);
if (dr7 & DR7_GD) {
/*
@@ -5068,7 +5116,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
* guest debugging itself.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
vcpu->run->debug.arch.dr7 = dr7;
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
@@ -5096,14 +5144,15 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (exit_qualification & TYPE_MOV_FROM_DR) {
unsigned long val;
- if (kvm_get_dr(vcpu, dr, &val))
- return 1;
+ kvm_get_dr(vcpu, dr, &val);
kvm_register_write(vcpu, reg, val);
- } else
- if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
- return 1;
+ err = 0;
+ } else {
+ err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+ }
- return kvm_skip_emulated_instruction(vcpu);
+out:
+ return kvm_complete_insn_gp(vcpu, err);
}
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
@@ -5177,9 +5226,8 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
u64 new_bv = kvm_read_edx_eax(vcpu);
u32 index = kvm_rcx_read(vcpu);
- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- return kvm_skip_emulated_instruction(vcpu);
- return 1;
+ int err = kvm_set_xcr(vcpu, index, new_bv);
+ return kvm_complete_insn_gp(vcpu, err);
}
static int handle_apic_access(struct kvm_vcpu *vcpu)
@@ -5600,6 +5648,13 @@ static int handle_encls(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5656,6 +5711,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -5667,7 +5723,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
struct vcpu_vmx *vmx = to_vmx(vcpu);
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ if (!(vmx->exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -5908,11 +5964,12 @@ void dump_vmcs(void)
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ u16 exit_handler_index;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -5954,11 +6011,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 1;
}
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ if (exit_reason.failed_vmentry) {
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
+ = exit_reason.full;
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -5980,18 +6037,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* will cause infinite loop.
*/
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_PML_FULL &&
- exit_reason != EXIT_REASON_APIC_ACCESS &&
- exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.ndata++;
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -6023,42 +6080,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- if (exit_reason >= kvm_vmx_max_exit_handlers)
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
- else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
- else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
return handle_interrupt_window(vcpu);
- else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
return handle_external_interrupt(vcpu);
- else if (exit_reason == EXIT_REASON_HLT)
+ else if (exit_reason.basic == EXIT_REASON_HLT)
return kvm_emulate_halt(vcpu);
- else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
return handle_ept_misconfig(vcpu);
#endif
- exit_reason = array_index_nospec(exit_reason,
- kvm_vmx_max_exit_handlers);
- if (!kvm_vmx_exit_handlers[exit_reason])
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
goto unexpected_vmexit;
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason.full);
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[0] = exit_reason.full;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+
+ /*
+ * Even when current exit reason is handled by KVM internally, we
+ * still need to exit to user space when bus lock detected to inform
+ * that there is a bus lock in guest.
+ */
+ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (ret > 0)
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+ }
+ return ret;
+}
+
/*
* Software based L1D cache flush which is used when microcode providing
* the cache control MSR is not loaded.
@@ -6129,7 +6206,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6373,9 +6450,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
- else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+ else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
}
@@ -6567,7 +6644,7 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- switch (to_vmx(vcpu)->exit_reason) {
+ switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
@@ -6577,8 +6654,6 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
}
}
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
@@ -6638,11 +6713,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
- fastpath_t exit_fastpath;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
-reenter_guest:
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6696,6 +6769,8 @@ reenter_guest:
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
+ if (intel_pmu_lbr_is_enabled(vcpu))
+ vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
@@ -6734,12 +6809,12 @@ reenter_guest:
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs))
+ if (static_branch_unlikely(&enable_evmcs)) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- if (static_branch_unlikely(&enable_evmcs))
- current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+ current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+ }
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
@@ -6768,17 +6843,17 @@ reenter_guest:
vmx->idt_vectoring_info = 0;
if (unlikely(vmx->fail)) {
- vmx->exit_reason = 0xdead;
+ vmx->exit_reason.full = 0xdead;
return EXIT_FASTPATH_NONE;
}
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
@@ -6790,22 +6865,7 @@ reenter_guest:
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
- exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
- if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
- if (!kvm_vcpu_exit_request(vcpu)) {
- /*
- * FIXME: this goto should be a loop in vcpu_enter_guest,
- * but it would incur the cost of a retpoline for now.
- * Revisit once static calls are available.
- */
- if (vcpu->arch.apicv_active)
- vmx_sync_pir_to_irr(vcpu);
- goto reenter_guest;
- }
- exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
- }
-
- return exit_fastpath;
+ return vmx_exit_handlers_fastpath(vcpu);
}
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -7256,7 +7316,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_cr4_guest_host_mask(vmx);
/* Refresh #PF interception to account for MAXPHYADDR changes. */
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
}
static __init void vmx_set_cpu_caps(void)
@@ -7546,7 +7606,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return 0;
}
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
@@ -7606,7 +7666,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_exception_bitmap = update_exception_bitmap,
+ .update_exception_bitmap = vmx_update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
@@ -7649,9 +7709,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.nmi_allowed = vmx_nmi_allowed,
.get_nmi_mask = vmx_get_nmi_mask,
.set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -7709,7 +7769,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.smi_allowed = vmx_smi_allowed,
.pre_enter_smm = vmx_pre_enter_smm,
.pre_leave_smm = vmx_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .enable_smi_window = vmx_enable_smi_window,
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -7810,6 +7870,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}
+ kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9d3a557949ac..12c53d05a902 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -70,6 +70,54 @@ struct pt_desc {
struct pt_ctx guest;
};
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
+#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
+
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu);
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -244,7 +292,7 @@ struct vcpu_vmx {
int vpid;
bool emulation_required;
- u32 exit_reason;
+ union vmx_exit_reason exit_reason;
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -279,6 +327,7 @@ struct vcpu_vmx {
u64 ept_pointer;
struct pt_desc pt_desc;
+ struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
@@ -329,7 +378,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
int root_level);
-void update_exception_bitmap(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
@@ -339,8 +388,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type, bool value);
static inline u8 vmx_get_rvi(void)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b967c1c774a1..884e5b3838c7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -29,6 +29,7 @@
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"
+#include "xen.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -114,11 +115,21 @@ static int sync_regs(struct kvm_vcpu *vcpu);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+#define KVM_X86_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
+ *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
+
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
-static bool __read_mostly report_ignored_msrs = true;
+bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
@@ -136,6 +147,8 @@ u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
+bool __read_mostly kvm_has_bus_lock_exit;
+EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -234,7 +247,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
- VM_STAT("mmu_pte_updated", mmu_pte_updated),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
VM_STAT("mmu_flooded", mmu_flooded),
VM_STAT("mmu_recycled", mmu_recycled),
@@ -395,7 +407,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
- u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+ u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
@@ -484,19 +496,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
*/
vcpu->arch.dr6 &= ~DR_TRAP_BITS;
/*
- * DR6.RTM is set by all #DB exceptions that don't clear it.
+ * In order to reflect the #DB exception payload in guest
+ * dr6, three components need to be considered: active low
+ * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
+ * DR6_BS and DR6_BT)
+ * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
+ * In the target guest dr6:
+ * FIXED_1 bits should always be set.
+ * Active low bits should be cleared if 1-setting in payload.
+ * Active high bits should be set if 1-setting in payload.
+ *
+ * Note, the payload is compatible with the pending debug
+ * exceptions/exit qualification under VMX, that active_low bits
+ * are active high in payload.
+ * So they need to be flipped for DR6.
*/
- vcpu->arch.dr6 |= DR6_RTM;
+ vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
vcpu->arch.dr6 |= payload;
- /*
- * Bit 16 should be set in the payload whenever the #DB
- * exception should clear DR6.RTM. This makes the payload
- * compatible with the pending debug exceptions under VMX.
- * Though not currently documented in the SDM, this also
- * makes the payload compatible with the exit qualification
- * for #DB exceptions under VMX.
- */
- vcpu->arch.dr6 ^= payload & DR6_RTM;
+ vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
/*
* The #DB payload is defined as compatible with the 'pending
@@ -692,7 +709,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
+ if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -742,8 +759,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
- return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
- rsvd_bits(1, 2);
+ return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}
/*
@@ -852,7 +868,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_pae(vcpu))
return 1;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
}
@@ -865,7 +881,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
- kvm_x86_ops.set_cr0(vcpu, cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
@@ -970,12 +986,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
- __kvm_set_xcr(vcpu, index, xcr)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
+ if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ return __kvm_set_xcr(vcpu, index, xcr);
+
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
@@ -987,7 +1001,7 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return false;
- return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
+ return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}
EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
@@ -1031,7 +1045,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- kvm_x86_ops.set_cr4(vcpu, cr4);
+ static_call(kvm_x86_set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
@@ -1059,8 +1073,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
- if (is_long_mode(vcpu) &&
- (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+ if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1114,7 +1127,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- kvm_x86_ops.set_dr7(vcpu, dr7);
+ static_call(kvm_x86_set_dr7)(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1130,7 +1143,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
return fixed;
}
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
@@ -1143,13 +1156,13 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 4:
case 6:
if (!kvm_dr6_valid(val))
- return -1; /* #GP */
+ return 1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
break;
case 5:
default: /* 7 */
if (!kvm_dr7_valid(val))
- return -1; /* #GP */
+ return 1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
break;
@@ -1157,18 +1170,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
-
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- if (__kvm_set_dr(vcpu, dr, val)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
-}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
@@ -1185,7 +1189,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
*val = vcpu->arch.dr7;
break;
}
- return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1426,7 +1429,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- return kvm_x86_ops.get_msr_feature(msr);
+ return static_call(kvm_x86_get_msr_feature)(msr);
}
return 0;
}
@@ -1502,7 +1505,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- r = kvm_x86_ops.set_efer(vcpu, efer);
+ r = static_call(kvm_x86_set_efer)(vcpu, efer);
if (r) {
WARN_ON(r > 0);
return r;
@@ -1599,7 +1602,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
msr.index = index;
msr.host_initiated = host_initiated;
- return kvm_x86_ops.set_msr(vcpu, &msr);
+ return static_call(kvm_x86_set_msr)(vcpu, &msr);
}
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
@@ -1632,7 +1635,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
msr.index = index;
msr.host_initiated = host_initiated;
- ret = kvm_x86_ops.get_msr(vcpu, &msr);
+ ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
@@ -1673,12 +1676,12 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
- return kvm_x86_ops.complete_emulated_msr(vcpu, err);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
}
static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
static u64 kvm_msr_reason(int r)
@@ -1750,7 +1753,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read_ex(ecx);
}
- return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -1776,17 +1779,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
else
trace_kvm_msr_write_ex(ecx, data);
- return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
/*
* The fast path for frequent and performance sensitive wrmsr emulation,
@@ -1936,15 +1938,14 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
int version;
int r;
struct pvclock_wall_clock wc;
+ u32 wc_sec_hi;
u64 wall_nsec;
- kvm->arch.wall_clock = wall_clock;
-
if (!wall_clock)
return;
@@ -1973,6 +1974,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+ if (sec_hi_ofs) {
+ wc_sec_hi = wall_nsec >> 32;
+ kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+ &wc_sec_hi, sizeof(wc_sec_hi));
+ }
+
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
@@ -2209,7 +2216,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
vcpu->arch.l1_tsc_offset = offset;
- vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -2592,13 +2599,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return ret;
}
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
+ struct gfn_to_hva_cache *cache,
+ unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
+ if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
+ &guest_hv_clock, offset, sizeof(guest_hv_clock))))
return;
/* This VCPU is paused, but it's legal for a guest to read another
@@ -2621,9 +2630,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
++guest_hv_clock.version; /* first time write, random junk */
vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock.version));
smp_wmb();
@@ -2637,16 +2646,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2733,7 +2742,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time_enabled)
- kvm_setup_pvclock_page(v);
+ kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_set)
+ kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_set)
+ kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
if (v == kvm_get_vcpu(v->kvm, 0))
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
@@ -2858,32 +2872,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- int lm = is_long_mode(vcpu);
- u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
- : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
- u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
- : kvm->arch.xen_hvm_config.blob_size_32;
- u32 page_num = data & ~PAGE_MASK;
- u64 page_addr = data & PAGE_MASK;
- u8 *page;
-
- if (page_num >= blob_size)
- return 1;
-
- page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
- kfree(page);
- return 1;
- }
- return 0;
-}
-
static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
{
u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
@@ -2955,13 +2943,13 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_all(vcpu);
+ static_call(kvm_x86_tlb_flush_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_guest(vcpu);
+ static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -3017,6 +3005,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u32 msr = msr_info->index;
u64 data = msr_info->data;
+ if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+ return kvm_xen_write_hypercall_page(vcpu, data);
+
switch (msr) {
case MSR_AMD64_NB_CFG:
case MSR_IA32_UCODE_WRITE:
@@ -3073,18 +3064,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!data) {
- /* We support the non-activated case already */
- break;
- } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
- /* Values other than LBR and BTF are vendor-specific,
- thus reserved and should throw a #GP */
- return 1;
- } else if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
- break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
@@ -3153,13 +3132,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
return 1;
- kvm_write_wall_clock(vcpu->kvm, data);
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_WALL_CLOCK:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
return 1;
- kvm_write_wall_clock(vcpu->kvm, data);
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
@@ -3304,8 +3285,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.msr_misc_features_enables = data;
break;
default:
- if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
- return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
return KVM_MSR_RET_INVALID;
@@ -3357,7 +3336,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
- case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
@@ -3739,7 +3717,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
- case KVM_CAP_XEN_HVM:
case KVM_CAP_VCPU_EVENTS:
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
@@ -3779,6 +3756,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
r = 1;
break;
+ case KVM_CAP_XEN_HVM:
+ r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO;
+ break;
case KVM_CAP_SYNC_REGS:
r = KVM_SYNC_X86_VALID_FIELDS;
break;
@@ -3800,10 +3782,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
+ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
- r = !kvm_x86_ops.cpu_has_accelerated_tpr();
+ r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
break;
case KVM_CAP_NR_VCPUS:
r = KVM_SOFT_MAX_VCPUS;
@@ -3845,6 +3827,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_STEAL_TIME:
r = sched_info_on();
break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ if (kvm_has_bus_lock_exit)
+ r = KVM_BUS_LOCK_DETECTION_OFF |
+ KVM_BUS_LOCK_DETECTION_EXIT;
+ else
+ r = 0;
+ break;
default:
break;
}
@@ -3962,14 +3951,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops.has_wbinvd_exit())
+ if (static_call(kvm_x86_has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- kvm_x86_ops.vcpu_load(vcpu, cpu);
+ static_call(kvm_x86_vcpu_load)(vcpu, cpu);
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -4015,6 +4004,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
+ int idx;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4022,9 +4012,15 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
&vcpu->arch.st.cache, true))
- return;
+ goto out;
st = map.hva +
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4032,33 +4028,18 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+
+out:
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- int idx;
-
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
- /*
- * Disable page faults because we're in atomic context here.
- * kvm_write_guest_offset_cached() would call might_fault()
- * that relies on pagefault_disable() to tell if there's a
- * bug. NOTE: the write to guest memory may not go through if
- * during postcopy live migration or if there's heavy guest
- * paging.
- */
- pagefault_disable();
- /*
- * kvm_memslots() will be called by
- * kvm_write_guest_offset_cached() so take the srcu lock.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- pagefault_enable();
- kvm_x86_ops.vcpu_put(vcpu);
+ static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
* If userspace has set any breakpoints or watchpoints, dr6 is restored
@@ -4072,7 +4053,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -4182,7 +4163,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
- kvm_x86_ops.setup_mce(vcpu);
+ static_call(kvm_x86_setup_mce)(vcpu);
out:
return r;
}
@@ -4289,11 +4270,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+ events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
- events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
+ events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
events->nmi.pad = 0;
events->sipi_vector = 0; /* never valid when reporting to user space */
@@ -4360,13 +4341,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops.set_interrupt_shadow(vcpu,
- events->interrupt.shadow);
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
vcpu->arch.nmi_pending = events->nmi.pending;
- kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
+ static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -4422,9 +4403,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
- if (dbgregs->dr6 & ~0xffffffffull)
+ if (!kvm_dr6_valid(dbgregs->dr6))
return -EINVAL;
- if (dbgregs->dr7 & ~0xffffffffull)
+ if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
@@ -4661,7 +4642,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.enable_direct_tlbflush)
return -ENOTTY;
- return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+ return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
@@ -5032,6 +5013,26 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
break;
+ case KVM_XEN_VCPU_GET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_get_attr(vcpu, &xva);
+ if (!r && copy_to_user(argp, &xva, sizeof(xva)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_VCPU_SET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_set_attr(vcpu, &xva);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -5053,14 +5054,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = kvm_x86_ops.set_tss_addr(kvm, addr);
+ ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
+ return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -5217,8 +5218,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
*/
- if (kvm_x86_ops.flush_log_dirty)
- kvm_x86_ops.flush_log_dirty(kvm);
+ static_call_cond(kvm_x86_flush_log_dirty)(kvm);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -5308,6 +5308,20 @@ split_irqchip_unlock:
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
+ break;
+
+ if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
+ (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
+ break;
+
+ if (kvm_has_bus_lock_exit &&
+ cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
+ kvm->arch.bus_lock_detection_enabled = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -5637,11 +5651,27 @@ set_pit2_out:
r = -EFAULT;
if (copy_from_user(&xhc, argp, sizeof(xhc)))
goto out;
- r = -EINVAL;
- if (xhc.flags)
+ r = kvm_xen_hvm_config(kvm, &xhc);
+ break;
+ }
+ case KVM_XEN_HVM_GET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
goto out;
- memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
- r = 0;
+ r = kvm_xen_hvm_get_attr(kvm, &xha);
+ if (!r && copy_to_user(argp, &xha, sizeof(xha)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_HVM_SET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_set_attr(kvm, &xha);
break;
}
case KVM_SET_CLOCK: {
@@ -5686,7 +5716,7 @@ set_pit2_out:
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_op)
- r = kvm_x86_ops.mem_enc_op(kvm, argp);
+ r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -5698,7 +5728,7 @@ set_pit2_out:
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_reg_region)
- r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
+ r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -5710,7 +5740,7 @@ set_pit2_out:
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_unreg_region)
- r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
+ r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -5812,7 +5842,7 @@ static void kvm_init_msr_list(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
+ if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -5875,13 +5905,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
static void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops.set_segment(vcpu, var, seg);
+ static_call(kvm_x86_set_segment)(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops.get_segment(vcpu, var, seg);
+ static_call(kvm_x86_get_segment)(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -5901,14 +5931,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5916,7 +5946,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5965,7 +5995,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -5990,7 +6020,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -6011,7 +6041,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = 0;
- if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+ if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -6064,7 +6094,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = PFERR_WRITE_MASK;
- if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+ if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -6089,7 +6119,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
- if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@@ -6123,7 +6153,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -6531,7 +6561,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return kvm_x86_ops.get_segment_base(vcpu, seg);
+ return static_call(kvm_x86_get_segment_base)(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -6544,7 +6574,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (kvm_x86_ops.has_wbinvd_exit()) {
+ if (static_call(kvm_x86_has_wbinvd_exit)()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -6571,17 +6601,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
{
- return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value)
{
- return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+ return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -6649,27 +6679,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
+ return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -6811,7 +6841,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
+ return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
&ctxt->exception);
}
@@ -6849,7 +6879,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
+ static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
@@ -6865,7 +6895,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+ return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
}
static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
@@ -6927,7 +6957,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+ u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -6938,7 +6968,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -6980,7 +7010,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
@@ -7041,7 +7071,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -7101,9 +7131,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (vcpu->arch.mmu->direct_map) {
unsigned int indirect_shadow_pages;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
if (indirect_shadow_pages)
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -7210,7 +7240,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7222,10 +7252,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
int r;
- r = kvm_x86_ops.skip_emulated_instruction(vcpu);
+ r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
if (unlikely(!r))
return 0;
@@ -7254,7 +7284,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.eff_db);
if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7311,6 +7341,42 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+/*
+ * Decode to be emulated instruction. Return EMULATION_OK if success.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
+{
+ int r = EMULATION_OK;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since we do not set
+ * complete_userspace_io. This does not handle watchpoints yet,
+ * those would be handled in the emulate_ops.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len)
{
@@ -7319,7 +7385,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
- if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -7330,32 +7396,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*/
write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
- kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
-
- /*
- * We will reenter on the same instruction since
- * we do not set complete_userspace_io. This does not
- * handle watchpoints yet, those would be handled in
- * the emulate_ops.
- */
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
- return r;
+ kvm_clear_exception_queue(vcpu);
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
-
- trace_kvm_emulate_insn_start(vcpu);
- ++vcpu->stat.insn_emulation;
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
if (r != EMULATION_OK) {
if ((emulation_type & EMULTYPE_TRAP_UD) ||
(emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
@@ -7462,7 +7508,7 @@ restart:
r = 1;
if (writeback) {
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
@@ -7471,7 +7517,7 @@ restart:
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
if (kvm_x86_ops.update_emulated_instruction)
- kvm_x86_ops.update_emulated_instruction(vcpu);
+ static_call(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -7800,7 +7846,7 @@ static int kvm_is_user_mode(void)
int user_mode = 3;
if (__this_cpu_read(current_vcpu))
- user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
+ user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
@@ -7945,7 +7991,6 @@ int kvm_arch_init(void *opaque)
supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
- kvm_lapic_init();
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
#ifdef CONFIG_X86_64
@@ -7987,6 +8032,7 @@ void kvm_arch_exit(void)
kvm_mmu_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_fpu_cache);
+ WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
}
static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
@@ -8038,7 +8084,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
- if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+ if (!kvm_get_walltime_and_clockread(&ts, &cycle))
return -KVM_EOPNOTSUPP;
clock_pairing.sec = ts.tv_sec;
@@ -8114,7 +8160,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit;
- if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
nr = kvm_rax_read(vcpu);
@@ -8134,7 +8183,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
- if (kvm_x86_ops.get_cpl(vcpu) != 0) {
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
@@ -8191,7 +8240,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
- kvm_x86_ops.patch_hypercall(vcpu, instruction);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -8215,12 +8264,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->if_flag = !vcpu->arch.guest_state_protected
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
+
+ if (is_smm(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_SMM;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -8246,7 +8297,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
tpr = kvm_lapic_get_cr8(vcpu);
- kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
+ static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
@@ -8257,7 +8308,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- kvm_x86_ops.queue_exception(vcpu);
+ static_call(kvm_x86_queue_exception)(vcpu);
can_inject = false;
}
/*
@@ -8276,10 +8327,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- kvm_x86_ops.set_nmi(vcpu);
+ static_call(kvm_x86_set_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- kvm_x86_ops.set_irq(vcpu);
+ static_call(kvm_x86_set_irq)(vcpu);
can_inject = false;
}
}
@@ -8320,7 +8371,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
}
}
- kvm_x86_ops.queue_exception(vcpu);
+ static_call(kvm_x86_queue_exception)(vcpu);
can_inject = false;
}
@@ -8336,7 +8387,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
* The kvm_x86_ops hooks communicate this by returning -EBUSY.
*/
if (vcpu->arch.smi_pending) {
- r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
@@ -8345,35 +8396,35 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
enter_smm(vcpu);
can_inject = false;
} else
- kvm_x86_ops.enable_smi_window(vcpu);
+ static_call(kvm_x86_enable_smi_window)(vcpu);
}
if (vcpu->arch.nmi_pending) {
- r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- kvm_x86_ops.set_nmi(vcpu);
+ static_call(kvm_x86_set_nmi)(vcpu);
can_inject = false;
- WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+ WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
if (vcpu->arch.nmi_pending)
- kvm_x86_ops.enable_nmi_window(vcpu);
+ static_call(kvm_x86_enable_nmi_window)(vcpu);
}
if (kvm_cpu_has_injectable_intr(vcpu)) {
- r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- kvm_x86_ops.set_irq(vcpu);
- WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+ static_call(kvm_x86_set_irq)(vcpu);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
- kvm_x86_ops.enable_irq_window(vcpu);
+ static_call(kvm_x86_enable_irq_window)(vcpu);
}
if (is_guest_mode(vcpu) &&
@@ -8398,7 +8449,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* If an NMI is already in progress, limit further NMIs to just one.
* Otherwise, allow two (and we'll inject the first one immediately).
*/
- if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -8488,11 +8539,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
@@ -8542,7 +8593,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
@@ -8552,7 +8603,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
@@ -8582,30 +8633,30 @@ static void enter_smm(struct kvm_vcpu *vcpu)
* vCPU state (e.g. leave guest mode) after we've saved the state into
* the SMM state-save area.
*/
- kvm_x86_ops.pre_enter_smm(vcpu, buf);
+ static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
- if (kvm_x86_ops.get_nmi_mask(vcpu))
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- kvm_x86_ops.set_nmi_mask(vcpu, true);
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- kvm_x86_ops.set_cr0(vcpu, cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_x86_ops.set_cr4(vcpu, 0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- kvm_x86_ops.set_idt(vcpu, &dt);
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
- __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
cs.base = vcpu->arch.smbase;
@@ -8634,7 +8685,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- kvm_x86_ops.set_efer(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
#endif
kvm_update_cpuid_runtime(vcpu);
@@ -8672,7 +8723,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
kvm_apic_update_apicv(vcpu);
- kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
+ static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -8689,7 +8740,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
unsigned long old, new, expected;
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
+ !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
@@ -8709,7 +8760,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
trace_kvm_apicv_update_request(activate, bit);
if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
- kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
+ static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
/*
* Sending request to update APICV for all other vcpus,
@@ -8735,7 +8786,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -8753,9 +8804,12 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
- vcpu_to_synic(vcpu)->vec_bitmap, 256);
- kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ if (to_hv_vcpu(vcpu))
+ bitmap_or((ulong *)eoi_exit_bitmap,
+ vcpu->arch.ioapic_handled_vectors,
+ to_hv_synic(vcpu)->vec_bitmap, 256);
+
+ static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8780,7 +8834,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops.set_apic_access_page_addr)
return;
- kvm_x86_ops.set_apic_access_page_addr(vcpu);
+ static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8905,8 +8959,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ vcpu->run->hyperv = hv_vcpu->exit;
r = 0;
goto out;
}
@@ -8923,10 +8979,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_ops.msr_filter_changed(vcpu);
+ static_call(kvm_x86_msr_filter_changed)(vcpu);
}
- if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
+ kvm_xen_has_interrupt(vcpu)) {
++vcpu->stat.req_event;
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -8936,7 +8993,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
inject_pending_event(vcpu, &req_immediate_exit);
if (req_int_win)
- kvm_x86_ops.enable_irq_window(vcpu);
+ static_call(kvm_x86_enable_irq_window)(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -8951,7 +9008,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- kvm_x86_ops.prepare_guest_switch(vcpu);
+ static_call(kvm_x86_prepare_guest_switch)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -8982,7 +9039,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* notified with kvm_vcpu_kick.
*/
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -8996,7 +9053,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops.request_immediate_exit(vcpu);
+ static_call(kvm_x86_request_immediate_exit)(vcpu);
}
fpregs_assert_state_consistent();
@@ -9013,7 +9070,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- exit_fastpath = kvm_x86_ops.run(vcpu);
+ for (;;) {
+ exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+ break;
+ }
+
+ if (vcpu->arch.apicv_active)
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ }
/*
* Do this here before restoring debug registers on the host. And
@@ -9023,7 +9092,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- kvm_x86_ops.sync_dirty_debug_regs(vcpu);
+ static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
@@ -9045,7 +9114,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops.handle_exit_irqoff(vcpu);
+ static_call(kvm_x86_handle_exit_irqoff)(vcpu);
/*
* Consume any pending interrupts, including the possible source of
@@ -9087,13 +9156,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
+ r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
return r;
cancel_injection:
if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops.cancel_injection(vcpu);
+ static_call(kvm_x86_cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -9103,13 +9172,13 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
+ (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block)
- kvm_x86_ops.post_block(vcpu);
+ static_call(kvm_x86_post_block)(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -9330,6 +9399,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
+ kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
@@ -9504,10 +9574,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -9625,7 +9695,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
*/
if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
return false;
- if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
+ if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
return false;
} else {
/*
@@ -9660,10 +9730,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- kvm_x86_ops.set_idt(vcpu, &dt);
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- kvm_x86_ops.set_gdt(vcpu, &dt);
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
@@ -9673,14 +9743,14 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_ops.set_efer(vcpu, sregs->efer);
+ static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
+ static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9788,7 +9858,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops.update_exception_bitmap(vcpu);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
r = 0;
@@ -9966,7 +10036,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm))
vcpu->arch.apicv_active = true;
} else
- static_key_slow_inc(&kvm_no_apic_vcpu);
+ static_branch_inc(&kvm_has_noapic_vcpu);
r = -ENOMEM;
@@ -10004,7 +10074,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
@@ -10014,9 +10084,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
- kvm_hv_vcpu_init(vcpu);
-
- r = kvm_x86_ops.vcpu_create(vcpu);
+ r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
@@ -10052,8 +10120,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- kvm_hv_vcpu_postcreate(vcpu);
-
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
@@ -10079,7 +10145,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
- kvm_x86_ops.vcpu_free(vcpu);
+ static_call(kvm_x86_vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -10096,7 +10162,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
+ static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -10115,7 +10181,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
- vcpu->arch.dr6 = DR6_INIT;
+ vcpu->arch.dr6 = DR6_ACTIVE_LOW;
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -10168,7 +10234,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.ia32_xss = 0;
- kvm_x86_ops.vcpu_reset(vcpu, init_event);
+ static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -10194,7 +10260,7 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
- ret = kvm_x86_ops.hardware_enable();
+ ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -10276,7 +10342,7 @@ int kvm_arch_hardware_enable(void)
void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops.hardware_disable();
+ static_call(kvm_x86_hardware_disable)();
drop_user_return_notifiers();
}
@@ -10295,6 +10361,7 @@ int kvm_arch_hardware_setup(void *opaque)
return r;
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+ kvm_ops_static_call_update();
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
@@ -10323,7 +10390,7 @@ int kvm_arch_hardware_setup(void *opaque)
void kvm_arch_hardware_unsetup(void)
{
- kvm_x86_ops.hardware_unsetup();
+ static_call(kvm_x86_hardware_unsetup)();
}
int kvm_arch_check_processor_compat(void *opaque)
@@ -10351,8 +10418,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-struct static_key kvm_no_apic_vcpu __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
@@ -10363,12 +10430,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
pmu->need_cleanup = true;
kvm_make_request(KVM_REQ_PMU, vcpu);
}
- kvm_x86_ops.sched_in(vcpu, cpu);
+ static_call(kvm_x86_sched_in)(vcpu, cpu);
}
void kvm_arch_free_vm(struct kvm *kvm)
{
- kfree(kvm->arch.hyperv.hv_pa_pg);
+ kfree(to_kvm_hv(kvm)->hv_pa_pg);
vfree(kvm);
}
@@ -10407,7 +10474,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
- return kvm_x86_ops.vm_init(kvm);
+ return static_call(kvm_x86_vm_init)(kvm);
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -10552,8 +10619,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
- if (kvm_x86_ops.vm_destroy)
- kvm_x86_ops.vm_destroy(kvm);
+ static_call_cond(kvm_x86_vm_destroy)(kvm);
for (i = 0; i < kvm->arch.msr_filter.count; i++)
kfree(kvm->arch.msr_filter.ranges[i].bitmap);
kvm_pic_destroy(kvm);
@@ -10563,6 +10629,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
+ kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
}
@@ -10744,7 +10811,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*/
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
if (kvm_x86_ops.slot_enable_log_dirty) {
- kvm_x86_ops.slot_enable_log_dirty(kvm, new);
+ static_call(kvm_x86_slot_enable_log_dirty)(kvm, new);
} else {
int level =
kvm_dirty_log_manual_protect_and_init_set(kvm) ?
@@ -10761,8 +10828,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, new, level);
}
} else {
- if (kvm_x86_ops.slot_disable_log_dirty)
- kvm_x86_ops.slot_disable_log_dirty(kvm, new);
+ static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new);
}
}
@@ -10801,7 +10867,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
kvm_x86_ops.guest_apic_has_interrupt &&
- kvm_x86_ops.guest_apic_has_interrupt(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10820,12 +10886,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- kvm_x86_ops.nmi_allowed(vcpu, false)))
+ static_call(kvm_x86_nmi_allowed)(vcpu, false)))
return true;
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
- kvm_x86_ops.smi_allowed(vcpu, false)))
+ static_call(kvm_x86_smi_allowed)(vcpu, false)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10859,7 +10925,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
return true;
return false;
@@ -10877,7 +10943,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.interrupt_allowed(vcpu, false);
+ return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10903,7 +10969,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
- rflags = kvm_x86_ops.get_rflags(vcpu);
+ rflags = static_call(kvm_x86_get_rflags)(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -10915,7 +10981,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- kvm_x86_ops.set_rflags(vcpu, rflags);
+ static_call(kvm_x86_set_rflags)(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -11045,7 +11111,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
return false;
if (!kvm_pv_async_pf_enabled(vcpu) ||
- (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
+ (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
return false;
return true;
@@ -11190,7 +11256,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+ ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -11215,7 +11281,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -11226,7 +11292,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_vector_hashing_enabled(void)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 0f727b50bd3d..39eb04887141 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -98,7 +98,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
if (!is_long_mode(vcpu))
return false;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
@@ -129,7 +129,7 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_current(vcpu);
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
}
static inline int is_pae(struct kvm_vcpu *vcpu)
@@ -244,9 +244,10 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
{
- return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
+ return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
}
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -273,6 +274,8 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
@@ -296,6 +299,8 @@ extern int pi_inject_timer;
extern struct static_key kvm_no_apic_vcpu;
+extern bool report_ignored_msrs;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -391,7 +396,6 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index 000000000000..af8f6562fce4
--- /dev/null
+++ b/arch/x86/kvm/xen.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#include "x86.h"
+#include "xen.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+
+#include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
+
+#include "trace.h"
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+ gpa_t gpa = gfn_to_gpa(gfn);
+ int wc_ofs, sec_hi_ofs;
+ int ret;
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
+ gpa, PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ kvm->arch.xen.shinfo_set = true;
+
+ /* Paranoia checks on the 32-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ /* 32-bit location by default */
+ wc_ofs = offsetof(struct compat_shared_info, wc);
+ sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
+
+#ifdef CONFIG_X86_64
+ /* Paranoia checks on the 64-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
+ BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
+
+ if (kvm->arch.xen.long_mode) {
+ wc_ofs = offsetof(struct shared_info, wc);
+ sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
+ }
+#endif
+
+ kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ u8 rc = 0;
+
+ /*
+ * If the global upcall vector (HVMIRQ_callback_vector) is set and
+ * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
+ */
+ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
+ struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
+
+ /* No need for compat handling here */
+ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
+ offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+
+ /*
+ * For efficiency, this mirrors the checks for using the valid
+ * cache in kvm_read_guest_offset_cached(), but just uses
+ * __get_user() instead. And falls back to the slow path.
+ */
+ if (likely(slots->generation == ghc->generation &&
+ !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+ /* Fast path */
+ __get_user(rc, (u8 __user *)ghc->hva + offset);
+ } else {
+ /* Slow path */
+ kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+ sizeof(rc));
+ }
+
+ return rc;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
+ r = -EINVAL;
+ } else {
+ kvm->arch.xen.long_mode = !!data->u.long_mode;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (data->u.shared_info.gfn == GPA_INVALID) {
+ kvm->arch.xen.shinfo_set = false;
+ r = 0;
+ break;
+ }
+ r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ break;
+
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ kvm->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ data->u.long_mode = kvm->arch.xen.long_mode;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (kvm->arch.xen.shinfo_set)
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = kvm->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int idx, r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ /* No compat necessary here. */
+ BUILD_BUG_ON(sizeof(struct vcpu_info) !=
+ sizeof(struct compat_vcpu_info));
+
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.vcpu_info_set = false;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa,
+ sizeof(struct vcpu_info));
+ if (!r) {
+ vcpu->arch.xen.vcpu_info_set = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.vcpu_time_info_set = false;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
+ if (!r) {
+ vcpu->arch.xen.vcpu_time_info_set = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ if (vcpu->arch.xen.vcpu_info_set)
+ data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
+ else
+ data->u.gpa = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (vcpu->arch.xen.vcpu_time_info_set)
+ data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
+ else
+ data->u.gpa = GPA_INVALID;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ bool lm = is_long_mode(vcpu);
+
+ /* Latch long_mode for shared_info pages etc. */
+ vcpu->kvm->arch.xen.long_mode = lm;
+
+ /*
+ * If Xen hypercall intercept is enabled, fill the hypercall
+ * page with VMCALL/VMMCALL instructions since that's what
+ * we catch. Else the VMM has provided the hypercall pages
+ * with instructions of its own choosing, so use those.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ u8 instructions[32];
+ int i;
+
+ if (page_num)
+ return 1;
+
+ /* mov imm32, %eax */
+ instructions[0] = 0xb8;
+
+ /* vmcall / vmmcall */
+ kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+
+ /* ret */
+ instructions[8] = 0xc3;
+
+ /* int3 to pad */
+ memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
+
+ for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+ *(u32 *)&instructions[1] = i;
+ if (kvm_vcpu_write_guest(vcpu,
+ page_addr + (i * sizeof(instructions)),
+ instructions, sizeof(instructions)))
+ return 1;
+ }
+ } else {
+ /*
+ * Note, truncation is a non-issue as 'lm' is guaranteed to be
+ * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
+ */
+ hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
+ : kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u8 *page;
+
+ if (page_num >= blob_size)
+ return 1;
+
+ blob_addr += page_num * PAGE_SIZE;
+
+ page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+ kfree(page);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
+{
+ if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+ return -EINVAL;
+
+ /*
+ * With hypercall interception the kernel generates its own
+ * hypercall page so it must not be provided.
+ */
+ if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
+ (xhc->blob_addr_32 || xhc->blob_addr_64 ||
+ xhc->blob_size_32 || xhc->blob_size_64))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
+ static_branch_inc(&kvm_xen_enabled.key);
+ else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+
+ memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
+
+static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_rax_write(vcpu, result);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
+ return 1;
+
+ return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+}
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
+{
+ bool longmode;
+ u64 input, params[6];
+
+ input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+ /* Hyper-V hypercalls get bit 31 set in EAX */
+ if ((input & 0x80000000) &&
+ kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ longmode = is_64_bit_mode(vcpu);
+ if (!longmode) {
+ params[0] = (u32)kvm_rbx_read(vcpu);
+ params[1] = (u32)kvm_rcx_read(vcpu);
+ params[2] = (u32)kvm_rdx_read(vcpu);
+ params[3] = (u32)kvm_rsi_read(vcpu);
+ params[4] = (u32)kvm_rdi_read(vcpu);
+ params[5] = (u32)kvm_rbp_read(vcpu);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ params[0] = (u64)kvm_rdi_read(vcpu);
+ params[1] = (u64)kvm_rsi_read(vcpu);
+ params[2] = (u64)kvm_rdx_read(vcpu);
+ params[3] = (u64)kvm_r10_read(vcpu);
+ params[4] = (u64)kvm_r8_read(vcpu);
+ params[5] = (u64)kvm_r9_read(vcpu);
+ }
+#endif
+ trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ params[3], params[4], params[5]);
+
+ vcpu->run->exit_reason = KVM_EXIT_XEN;
+ vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+ vcpu->run->xen.u.hcall.longmode = longmode;
+ vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.input = input;
+ vcpu->run->xen.u.hcall.params[0] = params[0];
+ vcpu->run->xen.u.hcall.params[1] = params[1];
+ vcpu->run->xen.u.hcall.params[2] = params[2];
+ vcpu->run->xen.u.hcall.params[3] = params[3];
+ vcpu->run->xen.u.hcall.params[4] = params[4];
+ vcpu->run->xen.u.hcall.params[5] = params[5];
+ vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io =
+ kvm_xen_hypercall_complete_userspace;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
new file mode 100644
index 000000000000..b66a921776f4
--- /dev/null
+++ b/arch/x86/kvm/xen.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#ifndef __ARCH_X86_KVM_XEN_H__
+#define __ARCH_X86_KVM_XEN_H__
+
+#include <linux/jump_label_ratelimit.h>
+
+extern struct static_key_false_deferred kvm_xen_enabled;
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ (kvm->arch.xen_hvm_config.flags &
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+ return __kvm_xen_has_interrupt(vcpu);
+
+ return 0;
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+#include <asm/pvclock-abi.h>
+#include <asm/xen/interface.h>
+
+struct compat_arch_vcpu_info {
+ unsigned int cr2;
+ unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ uint16_t pad;
+ uint32_t evtchn_pending_sel;
+ struct compat_arch_vcpu_info arch;
+ struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+ unsigned int max_pfn;
+ unsigned int pfn_to_mfn_frame_list_list;
+ unsigned int nmi_reason;
+ unsigned int p2m_cr3;
+ unsigned int p2m_vaddr;
+ unsigned int p2m_generation;
+ uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+ struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS];
+ uint32_t evtchn_pending[32];
+ uint32_t evtchn_mask[32];
+ struct pvclock_wall_clock wc;
+ struct compat_arch_shared_info arch;
+};
+
+#endif /* __ARCH_X86_KVM_XEN_H__ */