author     Linus Torvalds <torvalds@linux-foundation.org>   2020-06-04 01:13:47 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-06-04 01:13:47 +0300
commit     039aeb9deb9291f3b19c375a8bc6fa7f768996cc
tree       d98d5ddf276843995aa214157b587bb88270c530 /arch/x86
parent     6b2591c21273ebf65c13dae5d260ce88f0f197dd
parent     13ffbd8db1dd43d63d086517872a4e702a6bf309
download   linux-039aeb9deb9291f3b19c375a8bc6fa7f768996cc.tar.xz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"ARM:
   - Move the arch-specific code into arch/arm64/kvm
   - Start the post-32bit cleanup
   - Cherry-pick a few non-invasive pre-NV patches

 x86:
   - Rework of TLB flushing
   - Rework of event injection, especially with respect to nested
     virtualization
   - Nested AMD event injection facelift, building on the rework of generic
     code and fixing a lot of corner cases
   - Nested AMD live migration support
   - Optimization for TSC deadline MSR writes and IPIs
   - Various cleanups
   - Asynchronous page fault cleanups (from tglx, common topic branch with
     tip tree)
   - Interrupt-based delivery of asynchronous "page ready" events (host
     side)
   - Hyper-V MSRs and hypercalls for guest debugging
   - VMX preemption timer fixes

 s390:
   - Cleanups

 Generic:
   - switch vCPU thread wakeup from swait to rcuwait

 The other architectures, and the guest side of the asynchronous page
 fault work, will come next week"
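[Editor's note] The "interrupt-based delivery of asynchronous 'page ready' events" item corresponds to the new guest/host ABI visible in the arch/x86/include/uapi/asm/kvm_para.h hunk further down: KVM_FEATURE_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK and the KVM_ASYNC_PF_DELIVERY_AS_INT enable bit. A minimal guest-side sketch of how that ABI could be exercised is shown below for orientation only; the in-tree guest wiring is explicitly deferred ("will come next week"), so the interrupt vector and helper names are illustrative assumptions, not code from this merge.

    /*
     * Illustrative sketch only: opting in to interrupt-based "page ready"
     * notifications using the ABI bits added in this merge.  The vector
     * choice and helper names are assumptions made for the example.
     */
    #include <asm/kvm_para.h>
    #include <asm/msr.h>

    #define EXAMPLE_ASYNC_PF_VECTOR 0xec        /* hypothetical vector */

    static void example_enable_async_pf_int(u64 apf_reason_pa)
    {
            if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT))
                    return;

            /* Tell the host which vector delivers "page ready" events. */
            wrmsrl(MSR_KVM_ASYNC_PF_INT,
                   EXAMPLE_ASYNC_PF_VECTOR & KVM_ASYNC_PF_VEC_MASK);

            /* Enable async #PF and request interrupt-based delivery. */
            wrmsrl(MSR_KVM_ASYNC_PF_EN, apf_reason_pa | KVM_ASYNC_PF_ENABLED |
                                        KVM_ASYNC_PF_DELIVERY_AS_INT);
    }

    /* From the interrupt handler, after consuming apf_reason.token: */
    static void example_ack_page_ready(void)
    {
            wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
    }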
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (256 commits)
KVM: selftests: fix rdtsc() for vmx_tsc_adjust_test
KVM: check userspace_addr for all memslots
KVM: selftests: update hyperv_cpuid with SynDBG tests
x86/kvm/hyper-v: Add support for synthetic debugger via hypercalls
x86/kvm/hyper-v: enable hypercalls regardless of hypercall page
x86/kvm/hyper-v: Add support for synthetic debugger interface
x86/hyper-v: Add synthetic debugger definitions
KVM: selftests: VMX preemption timer migration test
KVM: nVMX: Fix VMX preemption timer migration
x86/kvm/hyper-v: Explicitly align hcall param for kvm_hyperv_exit
KVM: x86/pmu: Support full width counting
KVM: x86/pmu: Tweak kvm_pmu_get_msr to pass 'struct msr_data' in
KVM: x86: announce KVM_FEATURE_ASYNC_PF_INT
KVM: x86: acknowledgment mechanism for async pf page ready notifications
KVM: x86: interrupt based APF 'page ready' event delivery
KVM: introduce kvm_read_guest_offset_cached()
KVM: rename kvm_arch_can_inject_async_page_present() to kvm_arch_can_dequeue_async_page_present()
KVM: x86: extend struct kvm_vcpu_pv_apf_data with token info
Revert "KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously"
KVM: VMX: Replace zero-length array with flexible-array
...
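[Editor's note] The "nested AMD live migration support" item shows up in the uapi changes below as KVM_STATE_NESTED_FORMAT_SVM, struct kvm_svm_nested_state_hdr/_data and the KVM_STATE_NESTED_GIF_SET flag. The following userspace sketch shows how a VMM could use the existing KVM_GET_NESTED_STATE ioctl with the new SVM format; the function name and the omission of error handling are illustrative assumptions.

    /*
     * Userspace sketch (assumptions: vcpu_fd already set up, minimal error
     * handling): on AMD the returned state->format is
     * KVM_STATE_NESTED_FORMAT_SVM, state->hdr.svm.vmcb_pa identifies the
     * guest's vmcb12, and the trailing data area carries the vmcb12 image
     * when KVM_STATE_NESTED_RUN_PENDING is set.
     */
    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    static struct kvm_nested_state *example_save_nested_state(int vcpu_fd)
    {
            size_t size = sizeof(struct kvm_nested_state) +
                          KVM_STATE_NESTED_SVM_VMCB_SIZE;
            struct kvm_nested_state *state = calloc(1, size);

            state->size = size;     /* tell KVM how big the buffer is */
            if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
                    free(state);
                    return NULL;
            }
            return state;   /* restore later via KVM_SET_NESTED_STATE */
    }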
Diffstat (limited to 'arch/x86')
52 files changed, 2980 insertions, 1741 deletions
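[Editor's note] The first hunks below drop the dedicated async_page_fault IDT entry from entry_32.S/entry_64.S; in its place, asm/kvm_para.h gains a static-key wrapper, kvm_handle_async_pf(), that the regular #PF path can call. The arch/x86/mm/fault.c hunk is not part of this excerpt, so the exact call site shown in the sketch below is an assumption about how the wrapper is meant to be used.

    /*
     * Sketch of the new guest-side hook shape: the ordinary page-fault
     * handler asks KVM whether the fault actually carries an async-PF
     * token.  Placement within do_page_fault() is assumed, not taken from
     * the excerpt above.
     */
    #include <asm/kvm_para.h>
    #include <asm/ptrace.h>

    dotraplinkage void do_page_fault(struct pt_regs *regs,
                                     unsigned long error_code,
                                     unsigned long address)
    {
            /* Patched-out branch unless the guest enabled async PF. */
            if (unlikely(kvm_handle_async_pf(regs, (u32)address)))
                    return;

            /* ... normal page-fault handling continues ... */
    }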
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ac232f456396..a5eed844e948 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1691,14 +1691,6 @@ SYM_CODE_START(general_protection) jmp common_exception SYM_CODE_END(general_protection) -#ifdef CONFIG_KVM_GUEST -SYM_CODE_START(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(async_page_fault) -#endif - SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 64fe3d82157e..eead1e2bebd5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1202,10 +1202,6 @@ idtentry xendebug do_debug has_error_code=0 idtentry general_protection do_general_protection has_error_code=1 idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 -#endif - #ifdef CONFIG_X86_MCE idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 4e91f6118d5d..7a4d2062385c 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -83,6 +83,8 @@ #define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) /* Crash MSR available */ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +/* Support for debug MSRs available */ +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) /* stimer Direct Mode is available */ #define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1c0b62d26962..1da5858501ca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,6 +83,10 @@ #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -107,15 +111,8 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -enum { - PT_PAGE_TABLE_LEVEL = 1, - PT_DIRECTORY_LEVEL = 2, - PT_PDPE_LEVEL = 3, - /* set max level to the biggest one */ - PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL, -}; -#define KVM_NR_PAGE_SIZES (PT_MAX_HUGEPAGE_LEVEL - \ - PT_PAGE_TABLE_LEVEL + 1) +#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G +#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) @@ -124,7 +121,7 @@ enum { static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) { - /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. 
*/ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); } @@ -164,9 +161,13 @@ enum kvm_reg { NR_VCPU_REGS, VCPU_EXREG_PDPTR = NR_VCPU_REGS, + VCPU_EXREG_CR0, VCPU_EXREG_CR3, + VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, + VCPU_EXREG_EXIT_INFO_1, + VCPU_EXREG_EXIT_INFO_2, }; enum { @@ -182,8 +183,10 @@ enum { enum exit_fastpath_completion { EXIT_FASTPATH_NONE, - EXIT_FASTPATH_SKIP_EMUL_INS, + EXIT_FASTPATH_REENTER_GUEST, + EXIT_FASTPATH_EXIT_HANDLED, }; +typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; @@ -372,12 +375,12 @@ struct rsvd_bits_validate { }; struct kvm_mmu_root_info { - gpa_t cr3; + gpa_t pgd; hpa_t hpa; }; #define KVM_MMU_ROOT_INFO_INVALID \ - ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE }) #define KVM_MMU_NUM_PREV_ROOTS 3 @@ -403,7 +406,7 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - gpa_t root_cr3; + gpa_t root_pgd; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; @@ -598,6 +601,7 @@ struct kvm_vcpu_arch { u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; + u64 perf_capabilities; /* * Paging state of the vcpu @@ -650,7 +654,6 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; - u32 guest_xstate_size; struct kvm_pio_request pio; void *pio_data; @@ -680,6 +683,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; + int tdp_level; /* emulate context */ @@ -703,6 +707,7 @@ struct kvm_vcpu_arch { struct gfn_to_pfn_cache cache; } st; + u64 l1_tsc_offset; u64 tsc_offset; u64 last_guest_tsc; u64 last_host_tsc; @@ -762,14 +767,17 @@ struct kvm_vcpu_arch { struct { bool halted; - gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + gfn_t gfns[ASYNC_PF_PER_VCPU]; struct gfn_to_hva_cache data; - u64 msr_val; + u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */ + u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ + u16 vec; u32 id; bool send_user_only; - u32 host_apf_reason; + u32 host_apf_flags; unsigned long nested_apf_token; bool delivery_as_pf_vmexit; + bool pageready_pending; } apf; /* OSVW MSRs (AMD only) */ @@ -855,6 +863,18 @@ struct kvm_apic_map { struct kvm_lapic *phys_map[]; }; +/* Hyper-V synthetic debugger (SynDbg)*/ +struct kvm_hv_syndbg { + struct { + u64 control; + u64 status; + u64 send_page; + u64 recv_page; + u64 pending_page; + } control; + u64 options; +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -878,6 +898,7 @@ struct kvm_hv { atomic_t num_mismatched_vp_indexes; struct hv_partition_assist_pg *hv_pa_pg; + struct kvm_hv_syndbg hv_syndbg; }; enum kvm_irqchip_mode { @@ -1028,6 +1049,8 @@ struct kvm_vcpu_stat { u64 irq_injections; u64 nmi_injections; u64 req_event; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; }; struct x86_instruction_info; @@ -1059,7 +1082,7 @@ struct kvm_x86_ops { void (*hardware_disable)(void); void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); - bool (*has_emulated_msr)(int index); + bool (*has_emulated_msr)(u32 index); void (*cpuid_update)(struct kvm_vcpu *vcpu); unsigned int vm_size; @@ -1085,8 +1108,6 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void 
(*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); @@ -1100,7 +1121,8 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + void (*tlb_flush_all)(struct kvm_vcpu *vcpu); + void (*tlb_flush_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@ -1113,7 +1135,13 @@ struct kvm_x86_ops { */ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); - void (*run)(struct kvm_vcpu *vcpu); + /* + * Flush any TLB entries created by the guest. Like tlb_flush_gva(), + * does not need to flush GPA->HPA mappings. + */ + void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + + enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1126,8 +1154,8 @@ struct kvm_x86_ops { void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); + int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); @@ -1141,7 +1169,7 @@ struct kvm_x86_ops { bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); - void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); + void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -1153,7 +1181,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); @@ -1163,10 +1190,8 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception); - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1199,6 +1224,7 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + const struct kvm_x86_nested_ops *nested_ops; /* * Architecture specific hooks for vCPU blocking due to @@ -1226,18 +1252,10 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - int (*get_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - unsigned user_data_size); - int (*set_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - struct kvm_nested_state 
*kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); - - int (*smi_allowed)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); - int (*enable_smi_window)(struct kvm_vcpu *vcpu); + void (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1245,14 +1263,28 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version); - uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + + void (*migrate_timers)(struct kvm_vcpu *vcpu); +}; + +struct kvm_x86_nested_ops { + int (*check_events)(struct kvm_vcpu *vcpu); + bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + int (*get_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + + int (*enable_evmcs)(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); + uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { @@ -1451,6 +1483,8 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long pay void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); @@ -1478,6 +1512,8 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); +void kvm_update_dr7(struct kvm_vcpu *vcpu); + int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); @@ -1508,8 +1544,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, + bool skip_mmu_sync); void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); @@ -1573,8 +1612,6 @@ enum { }; #define HF_GIF_MASK (1 << 0) -#define HF_HIF_MASK (1 << 1) -#define HF_VINTR_MASK (1 << 2) #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ @@ -1640,7 +1677,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void 
kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 9b4df6eaa11a..57fd1966c4ea 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,11 +88,21 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); +void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); -u32 kvm_read_and_reset_pf_reason(void); -extern void kvm_disable_steal_time(void); -void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); +u32 kvm_read_and_reset_apf_flags(void); +void kvm_disable_steal_time(void); +bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); + +DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + +static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + if (static_branch_unlikely(&kvm_async_pf_enabled)) + return __kvm_handle_async_pf(regs, token); + else + return false; +} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init kvm_spinlock_init(void); @@ -103,7 +113,7 @@ static inline void kvm_spinlock_init(void) #endif /* CONFIG_PARAVIRT_SPINLOCKS */ #else /* CONFIG_KVM_GUEST */ -#define kvm_async_pf_task_wait(T, I) do {} while(0) +#define kvm_async_pf_task_wait_schedule(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) @@ -121,7 +131,7 @@ static inline unsigned int kvm_arch_para_hints(void) return 0; } -static inline u32 kvm_read_and_reset_pf_reason(void) +static inline u32 kvm_read_and_reset_apf_flags(void) { return 0; } @@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void) { return; } + +static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + return false; +} #endif #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6ece8561ba66..8a1f5382a4ea 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -96,7 +96,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[768]; }; @@ -203,8 +202,16 @@ struct __attribute__ ((__packed__)) vmcb_save_area { u64 last_excp_to; }; + +static inline void __unused_size_checks(void) +{ + BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); + BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); +} + struct __attribute__ ((__packed__)) vmcb { struct vmcb_control_area control; + u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; }; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5e090d1f03f8..cd7de4b401fe 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -527,10 +527,12 @@ struct vmx_msr_entry { /* * Exit Qualifications for entry failure during or after loading guest state */ -#define ENTRY_FAIL_DEFAULT 0 -#define 
ENTRY_FAIL_PDPTE 2 -#define ENTRY_FAIL_NMI 3 -#define ENTRY_FAIL_VMCS_LINK_PTR 4 +enum vm_entry_failure_code { + ENTRY_FAIL_DEFAULT = 0, + ENTRY_FAIL_PDPTE = 2, + ENTRY_FAIL_NMI = 3, + ENTRY_FAIL_VMCS_LINK_PTR = 4, +}; /* * Exit Qualifications for EPT Violations diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 96d9cd208610..6807153c0410 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -50,14 +50,12 @@ struct x86_init_resources { * @pre_vector_init: init code to run before interrupt vectors * are set up. * @intr_init: interrupt init code - * @trap_init: platform specific trap setup * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); - void (*trap_init)(void); void (*intr_mode_select)(void); void (*intr_mode_init)(void); }; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 3f3f780c8c65..17c5a038f42d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -385,32 +385,48 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM 1 /* unused */ +#define KVM_STATE_NESTED_FORMAT_SVM 1 #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 #define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_MTF_PENDING 0x00000008 +#define KVM_STATE_NESTED_GIF_SET 0x00000100 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +#define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000 + +#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; }; struct kvm_vmx_nested_state_hdr { + __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; + __u64 preemption_timer_deadline; struct { __u16 flags; } smm; }; +struct kvm_svm_nested_state_data { + /* Save area only used if KVM_STATE_NESTED_RUN_PENDING. */ + __u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE]; +}; + +struct kvm_svm_nested_state_hdr { + __u64 vmcb_pa; +}; + /* for KVM_CAP_NESTED_STATE */ struct kvm_nested_state { __u16 flags; @@ -419,6 +435,7 @@ struct kvm_nested_state { union { struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; /* Pad the header to 128 bytes. 
*/ __u8 pad[120]; @@ -431,6 +448,7 @@ struct kvm_nested_state { */ union { struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; } data; }; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 2a8e0b6b9805..812e9b4c1114 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -31,6 +31,7 @@ #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 +#define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_HINTS_REALTIME 0 @@ -50,6 +51,8 @@ #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 #define MSR_KVM_POLL_CONTROL 0x4b564d05 +#define MSR_KVM_ASYNC_PF_INT 0x4b564d06 +#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07 struct kvm_steal_time { __u64 steal; @@ -81,6 +84,11 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) +#define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) + +/* MSR_KVM_ASYNC_PF_INT */ +#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) + /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 @@ -112,8 +120,13 @@ struct kvm_mmu_op_release_pt { #define KVM_PV_REASON_PAGE_READY 2 struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; + /* Used for 'page not present' events delivered via #PF */ + __u32 flags; + + /* Used for 'page ready' events delivered via interrupt notification */ + __u32 token; + + __u8 pad[56]; __u32 enabled; }; diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index e95b72ec19bc..b8ff9e8ac0d5 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -150,6 +150,9 @@ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" } +#define VMX_EXIT_REASON_FLAGS \ + { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } + #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6efe0410fb72..d6f22a3a1f7d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -35,6 +35,8 @@ #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + static int kvmapf = 1; static int __init parse_no_kvmapf(char *arg) @@ -73,7 +75,6 @@ struct kvm_task_sleep_node { struct swait_queue_head wq; u32 token; int cpu; - bool halted; }; static struct kvm_task_sleep_head { @@ -96,77 +97,64 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -/* - * @interrupt_kernel: Is this called from a routine which interrupts the kernel - * (other than user space)? - */ -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); + struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); - kfree(e); raw_spin_unlock(&b->lock); - - rcu_irq_exit(); - return; + kfree(e); + return false; } - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || - (IS_ENABLED(CONFIG_PREEMPT_COUNT) - ? 
preempt_count() > 1 || rcu_preempt_depth() - : interrupt_kernel); - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); + n->token = token; + n->cpu = smp_processor_id(); + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, &n)) + return; for (;;) { - if (!n.halted) - prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; - rcu_irq_exit(); - - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. - */ - native_safe_halt(); - local_irq_disable(); - } - - rcu_irq_enter(); + local_irq_enable(); + schedule(); + local_irq_disable(); } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; + finish_swait(&n.wq, &wait); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->halted) - smp_send_reschedule(n->cpu); - else if (swq_has_sleeper(&n->wq)) + if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); } @@ -175,12 +163,13 @@ static void apf_task_wake_all(void) int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); + n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } @@ -221,46 +210,61 @@ again: n->cpu = smp_processor_id(); init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); - } else + } else { apf_task_wake_one(n); + } raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_pf_reason(void) +u32 kvm_read_and_reset_apf_flags(void) { - u32 reason = 0; + u32 flags = 0; if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); } - return reason; + return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); +NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags); -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - switch (kvm_read_and_reset_pf_reason()) { - default: - do_page_fault(regs, error_code, address); - break; + u32 reason = kvm_read_and_reset_apf_flags(); + + switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: - /* page is swapped out by the host. 
*/ - kvm_async_pf_task_wait((u32)address, !user_mode(regs)); - break; case KVM_PV_REASON_PAGE_READY: + break; + default: + return false; + } + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. */ + kvm_async_pf_task_wait_schedule(token); + } else { rcu_irq_enter(); - kvm_async_pf_task_wake((u32)address); + kvm_async_pf_task_wake(token); rcu_irq_exit(); - break; } + return true; } -NOKPROBE_SYMBOL(do_async_page_fault); +NOKPROBE_SYMBOL(__kvm_handle_async_pf); static void __init paravirt_ops_setup(void) { @@ -306,11 +310,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; -#ifdef CONFIG_PREEMPTION - pa |= KVM_ASYNC_PF_SEND_ALWAYS; -#endif + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); pa |= KVM_ASYNC_PF_ENABLED; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) @@ -318,12 +322,12 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); + pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; + /* Size alignment is implied but just to make it explicit. 
*/ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); @@ -344,8 +348,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); + pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); } static void kvm_pv_guest_cpu_reboot(void *unused) @@ -592,12 +595,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu) } #endif -static void __init kvm_apf_trap_init(void) -{ - update_intr_gate(X86_TRAP_PF, async_page_fault); -} - - static void kvm_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -632,8 +629,6 @@ static void __init kvm_guest_init(void) register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; @@ -649,6 +644,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + static_branch_enable(&kvm_async_pf_enabled); + #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 428186d9de46..4cc541051994 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -903,7 +903,5 @@ void __init trap_init(void) idt_setup_ist_traps(); - x86_init.irqs.trap_init(); - idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 85f1a90c55cd..123f1c1f1788 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, - .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init }, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 901cd1fdecd9..253b8e875ccd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -86,12 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0, false); + best->ebx = xstate_required_size(vcpu->arch.xcr0, false); } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); @@ -124,8 +122,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) MSR_IA32_MISC_ENABLE_MWAIT); } - /* Update physical-address width */ + /* Note, maxphyaddr must be updated before tdp_level. 
*/ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); @@ -297,7 +296,7 @@ void kvm_set_cpu_caps(void) F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | @@ -712,7 +711,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | - (1 << KVM_FEATURE_PV_SCHED_YIELD); + (1 << KVM_FEATURE_PV_SCHED_YIELD) | + (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); @@ -728,6 +728,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; + case 0x80000006: + /* L2 cache and TLB: pass through host info. */ + break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ entry->edx &= (1 << 8); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 63a70f6a3df3..05434cd9342f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -303,4 +303,9 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bddaba9c68dd..de5476f8683e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5798,6 +5798,8 @@ writeback: } ctxt->eip = ctxt->_eip; + if (ctxt->mode != X86EMUL_MODE_PROT64) + ctxt->eip = (u32)ctxt->_eip; done: if (rc == X86EMUL_PROPAGATE_FAULT) { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6bc6d7613f76..238b78e069fe 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -21,6 +21,7 @@ #include "x86.h" #include "lapic.h" #include "ioapic.h" +#include "cpuid.h" #include "hyperv.h" #include <linux/cpu.h> @@ -266,6 +267,123 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } +static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, + HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, + 0); + if (!entry) + return false; + + return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; +} + +static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) + hv->hv_syndbg.control.status = + vcpu->run->hyperv.u.syndbg.status; + return 1; +} + +static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; + hv_vcpu->exit.u.syndbg.msr = msr; + hv_vcpu->exit.u.syndbg.control = syndbg->control.control; + hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; + 
hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; + hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; + vcpu->arch.complete_userspace_io = + kvm_hv_syndbg_complete_userspace; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + syndbg->control.control = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_STATUS: + syndbg->control.status = data; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + syndbg->control.send_page = data; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + syndbg->control.recv_page = data; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + syndbg->control.pending_page = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + syndbg->options = data; + break; + default: + break; + } + + return 0; +} + +static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + *pdata = syndbg->control.control; + break; + case HV_X64_MSR_SYNDBG_STATUS: + *pdata = syndbg->control.status; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + *pdata = syndbg->control.send_page; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + *pdata = syndbg->control.recv_page; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + *pdata = syndbg->control.pending_page; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + *pdata = syndbg->options; + break; + default: + break; + } + + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, + *pdata); + + return 0; +} + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { @@ -800,6 +918,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } @@ -1061,6 +1181,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1190,7 +1313,8 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 0; } -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; @@ -1227,6 +1351,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1316,7 +1443,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) int r; mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else @@ -1425,8 +1552,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, - KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: @@ -1530,7 +1656,7 @@ ret_success: bool kvm_hv_hypercall_enabled(struct kvm *kvm) { - return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; + return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) @@ -1709,6 +1835,34 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); break; + case HVCALL_POST_DEBUG_DATA: + case HVCALL_RETRIEVE_DEBUG_DATA: + if (unlikely(fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + fallthrough; + case HVCALL_RESET_DEBUG_SESSION: { + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu)) { + ret = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { + ret = HV_STATUS_OPERATION_DENIED; + break; + } + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = param; + vcpu->run->hyperv.u.hcall.params[0] = ingpa; + vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->arch.complete_userspace_io = + kvm_hv_hypercall_complete_userspace; + return 0; + } default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -1796,12 +1950,15 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, + { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); - if (kvm_x86_ops.nested_get_evmcs_version) - evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu); + if (kvm_x86_ops.nested_ops->get_evmcs_version) + evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); /* Skip NESTED_FEATURES if eVMCS is not supported */ if (!evmcs_ver) @@ -1821,7 +1978,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); - ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; @@ -1860,6 +2017,10 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->ebx |= HV_DEBUGGING; + 
ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; + ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + /* * Direct Synthetic timers only make sense with in-kernel * LAPIC @@ -1903,6 +2064,24 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; + case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = 0; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_SYNDBG_INTERFACE: + memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: + ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + break; + default: break; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 757cb578101c..e68c6c2e9649 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -23,6 +23,33 @@ #include <linux/kvm_host.h> +/* + * The #defines related to the synthetic debugger are required by KDNet, but + * they are not documented in the Hyper-V TLFS because the synthetic debugger + * functionality has been deprecated and is subject to removal in future + * versions of Windows. + */ +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +/* + * Hyper-V synthetic debugger platform capabilities + * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits. + */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1) + +/* Hyper-V Synthetic debug options MSR */ +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */ +#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2) + static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) { return &vcpu->arch.hyperv; @@ -46,6 +73,11 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); } +static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +{ + return &vcpu->kvm->arch.hyperv.hv_syndbg; +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 2fb2e3c80724..660401700075 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -3,8 +3,8 @@ #define __KVM_IO_APIC_H #include <linux/kvm_host.h> - #include <kvm/iodev.h> +#include "irq.h" struct kvm; struct kvm_vcpu; @@ -108,11 +108,7 @@ do { \ static inline int ioapic_in_kernel(struct kvm *kvm) { - int mode = kvm->arch.irqchip_mode; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); - return mode == KVM_IRQCHIP_KERNEL; + return irqchip_kernel(kvm); } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index e330e7d125f7..99d118ffc67d 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -83,6 +83,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr); /* * check if there is pending interrupt without @@ -159,6 
+160,8 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); + if (kvm_x86_ops.migrate_timers) + kvm_x86_ops.migrate_timers(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f173ab6b407e..9b64abf9b3f1 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -16,7 +16,6 @@ #include <linux/spinlock.h> #include <kvm/iodev.h> -#include "ioapic.h" #include "lapic.h" #define PIC_NUM_PINS 16 @@ -66,15 +65,6 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline int pic_in_kernel(struct kvm *kvm) -{ - int mode = kvm->arch.irqchip_mode; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); - return mode == KVM_IRQCHIP_KERNEL; -} - static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; @@ -93,6 +83,11 @@ static inline int irqchip_kernel(struct kvm *kvm) return mode == KVM_IRQCHIP_KERNEL; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + return irqchip_kernel(kvm); +} + static inline int irqchip_in_kernel(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 62558b9bdda7..ff2d0e9ca3bc 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -116,8 +116,9 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; - if (tmask & vcpu->arch.cr0_guest_owned_bits) - kvm_x86_ops.decache_cr0_guest_bits(vcpu); + if ((tmask & vcpu->arch.cr0_guest_owned_bits) && + !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } @@ -129,8 +130,9 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; - if (tmask & vcpu->arch.cr4_guest_owned_bits) - kvm_x86_ops.decache_cr4_guest_bits(vcpu); + if ((tmask & vcpu->arch.cr4_guest_owned_bits) && + !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9af25c97612a..34a7e0533dad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -36,6 +36,7 @@ #include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" +#include "ioapic.h" #include "trace.h" #include "x86.h" #include "cpuid.h" @@ -110,11 +111,18 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) return apic->vcpu->vcpu_id; } -bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) +static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); } -EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt); + +bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops.set_hv_timer + && !(kvm_mwait_in_guest(vcpu->kvm) || + kvm_can_post_timer_interrupt(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer); static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { @@ -1593,7 +1601,7 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) } } -static void apic_timer_expired(struct kvm_lapic *apic) +static void apic_timer_expired(struct kvm_lapic *apic, bool 
from_timer_fn) { struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_timer *ktimer = &apic->lapic_timer; @@ -1604,6 +1612,12 @@ static void apic_timer_expired(struct kvm_lapic *apic) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; + if (!from_timer_fn && vcpu->arch.apicv_active) { + WARN_ON(kvm_get_running_vcpu() != vcpu); + kvm_apic_inject_pending_timer_irqs(apic); + return; + } + if (kvm_use_posted_timer_interrupt(apic->vcpu)) { if (apic->lapic_timer.timer_advance_ns) __kvm_wait_lapic_expire(vcpu); @@ -1643,18 +1657,23 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD); } else - apic_timer_expired(apic); + apic_timer_expired(apic, false); local_irq_restore(flags); } +static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) +{ + return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count; +} + static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; u64 ns_remaining_old, ns_remaining_new; - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); limit_periodic_timer_frequency(apic); now = ktime_get(); @@ -1672,14 +1691,15 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); } -static bool set_target_expiration(struct kvm_lapic *apic) +static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) { ktime_t now; u64 tscl = rdtsc(); + s64 deadline; now = ktime_get(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); if (!apic->lapic_timer.period) { apic->lapic_timer.tscdeadline = 0; @@ -1687,10 +1707,32 @@ static bool set_target_expiration(struct kvm_lapic *apic) } limit_periodic_timer_frequency(apic); + deadline = apic->lapic_timer.period; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + if (unlikely(count_reg != APIC_TMICT)) { + deadline = tmict_to_ns(apic, + kvm_lapic_get_reg(apic, count_reg)); + if (unlikely(deadline <= 0)) + deadline = apic->lapic_timer.period; + else if (unlikely(deadline > apic->lapic_timer.period)) { + pr_info_ratelimited( + "kvm: vcpu %i: requested lapic timer restore with " + "starting count register %#x=%u (%lld ns) > initial count (%lld ns). 
" + "Using initial count to start timer.\n", + apic->vcpu->vcpu_id, + count_reg, + kvm_lapic_get_reg(apic, count_reg), + deadline, apic->lapic_timer.period); + kvm_lapic_set_reg(apic, count_reg, 0); + deadline = apic->lapic_timer.period; + } + } + } apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); - apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + nsec_to_cycles(apic->vcpu, deadline); + apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline); return true; } @@ -1723,7 +1765,7 @@ static void start_sw_period(struct kvm_lapic *apic) if (ktime_after(ktime_get(), apic->lapic_timer.target_expiration)) { - apic_timer_expired(apic); + apic_timer_expired(apic, false); if (apic_lvtt_oneshot(apic)) return; @@ -1760,7 +1802,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) bool expired; WARN_ON(preemptible()); - if (!kvm_x86_ops.set_hv_timer) + if (!kvm_can_use_hv_timer(vcpu)) return false; if (!ktimer->tscdeadline) @@ -1785,7 +1827,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (atomic_read(&ktimer->pending)) { cancel_hv_timer(apic); } else if (expired) { - apic_timer_expired(apic); + apic_timer_expired(apic, false); cancel_hv_timer(apic); } } @@ -1833,9 +1875,9 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; - WARN_ON(swait_active(&vcpu->wq)); + WARN_ON(rcuwait_active(&vcpu->wait)); cancel_hv_timer(apic); - apic_timer_expired(apic); + apic_timer_expired(apic, false); if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); @@ -1872,17 +1914,22 @@ void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) restart_apic_timer(apic); } -static void start_apic_timer(struct kvm_lapic *apic) +static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg) { atomic_set(&apic->lapic_timer.pending, 0); if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - && !set_target_expiration(apic)) + && !set_target_expiration(apic, count_reg)) return; restart_apic_timer(apic); } +static void start_apic_timer(struct kvm_lapic *apic) +{ + __start_apic_timer(apic, APIC_TMICT); +} + static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); @@ -2336,7 +2383,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); - apic_timer_expired(apic); + apic_timer_expired(apic, true); if (lapic_is_periodic(apic)) { advance_periodic_target_expiration(apic); @@ -2493,6 +2540,14 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + + /* + * Get calculated timer current count for remaining timer period (if + * any) and store it in the returned register set. 
+ */ + __kvm_lapic_set_reg(s->regs, APIC_TMCCT, + __apic_read(vcpu->arch.apic, APIC_TMCCT)); + return kvm_apic_state_fixup(vcpu, s, false); } @@ -2520,7 +2575,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); - start_apic_timer(apic); + __start_apic_timer(apic, APIC_TMCCT); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a0ffb4331418..754f29beb83e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -161,9 +161,14 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) return *((u32 *) (apic->regs + reg_off)); } +static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) +{ + *((u32 *) (regs + reg_off)) = val; +} + static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - *((u32 *) (apic->regs + reg_off)) = val; + __kvm_lapic_set_reg(apic->regs, reg_off, val); } extern struct static_key kvm_no_apic_vcpu; @@ -245,7 +250,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); -bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu); +bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu); static inline enum lapic_mode kvm_apic_mode(u64 apic_base) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 8a3b1bce722a..0ad06bfe2c2c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,13 +51,13 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fd59fee84631..fdd05c233308 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -16,6 +16,7 @@ */ #include "irq.h" +#include "ioapic.h" #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" @@ -78,6 +79,9 @@ module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); +static bool __read_mostly force_flush_and_sync_on_reuse; +module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -244,7 +248,6 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_mmio_access_mask; static u64 __read_mostly shadow_present_mask; @@ -331,19 +334,19 @@ static 
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - BUG_ON((mmio_mask & mmio_value) != mmio_value); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_value; + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -566,7 +569,6 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_dirty_mask = 0; shadow_nx_mask = 0; shadow_x_mask = 0; - shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; @@ -583,16 +585,15 @@ static void kvm_mmu_reset_all_pte_masks(void) * the most significant bits of legal physical address space. */ shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_cache_bits; - if (boot_cpu_data.x86_cache_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) { + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; shadow_nonpresent_or_rsvd_mask = - rsvd_bits(boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len, - boot_cpu_data.x86_cache_bits - 1); - low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; - } else - WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF)); + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); @@ -620,7 +621,7 @@ static int is_large_pte(u64 pte) static int is_last_spte(u64 pte, int level) { - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) return 1; if (is_large_pte(pte)) return 1; @@ -1196,7 +1197,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, struct kvm_lpage_info *linfo; int i; - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; WARN_ON(linfo->disallow_lpage < 0); @@ -1225,7 +1226,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. 
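For the mask arithmetic used just above, rsvd_bits() and the L1TF shadow_nonpresent_or_rsvd_mask boil down to a few shifts. A stand-alone sketch, with the cache-bit count and the 5-bit mask length assumed for illustration rather than taken from real CPU data:

/* Quick check of the rsvd_bits()/L1TF mask arithmetic; numbers are made up. */
#include <stdint.h>
#include <stdio.h>

#define NONPRESENT_OR_RSVD_MASK_LEN 5   /* assumed mask length */

static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        int cache_bits = 46;    /* pretend boot_cpu_data.x86_cache_bits */
        int low_phys_bits = cache_bits - NONPRESENT_OR_RSVD_MASK_LEN;

        uint64_t mask = rsvd_bits(low_phys_bits, cache_bits - 1);
        printf("nonpresent/rsvd mask: bits %d-%d -> 0x%016llx\n",
               low_phys_bits, cache_bits - 1, (unsigned long long)mask);
        return 0;
}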
*/ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); @@ -1253,7 +1254,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); @@ -1398,7 +1399,7 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, @@ -1529,8 +1530,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == - PT_PAGE_TABLE_LEVEL); + WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1682,7 +1682,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1708,7 +1708,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ @@ -1760,7 +1760,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } @@ -1948,8 +1948,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, + for_each_slot_rmap_range(memslot, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, &iterator) ret |= handler(kvm, iterator.rmap, memslot, @@ -2153,10 +2153,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) -{ -} - static void nonpaging_update_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) @@ -2313,7 +2309,7 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, return; if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } #ifdef CONFIG_KVM_MMU_AUDIT @@ -2347,7 +2343,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, if (!s->unsync) continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(s->role.level != PG_LEVEL_4K); ret |= kvm_sync_page(vcpu, s, invalid_list); } @@ -2376,7 +2372,7 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, int level = sp->role.level; parents->idx[level-1] = idx; - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) break; parents->parent[level-2] = sp; @@ -2398,7 
+2394,7 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PT_PAGE_TABLE_LEVEL); + WARN_ON(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -2520,11 +2516,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); @@ -2546,11 +2542,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * be inconsistent with guest page table. */ account_shadowed(vcpu->kvm, sp); - if (level == PT_PAGE_TABLE_LEVEL && - rmap_write_protect(vcpu, gfn)) + if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) + if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } clear_page(sp->spt); @@ -2601,7 +2596,7 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { - if (iterator->level < PT_PAGE_TABLE_LEVEL) + if (iterator->level < PG_LEVEL_4K) return false; iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); @@ -2722,7 +2717,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct mmu_page_path parents; struct kvm_mmu_pages pages; - if (parent->role.level == PT_PAGE_TABLE_LEVEL) + if (parent->role.level == PG_LEVEL_4K) return 0; while (mmu_unsync_walk(parent, &pages)) { @@ -2921,7 +2916,7 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, if (sp->unsync) continue; - WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } @@ -3020,7 +3015,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); - if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled()) { pte_access &= ~ACC_EXEC_MASK; } @@ -3033,7 +3028,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (level > PT_PAGE_TABLE_LEVEL) + if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, @@ -3103,8 +3098,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. 
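The PG_LEVEL_4K/2M/1G arithmetic that replaces the old PT_*_LEVEL names is easy to sanity-check outside the kernel. A small sketch, restating the usual 9-bits-per-level constants as assumptions rather than quoting the headers:

/* Stand-alone check of the hugepage-level arithmetic; constants are assumed. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT              12
#define PG_LEVEL_4K             1
#define PG_LEVEL_2M             2
#define PG_LEVEL_1G             3
#define KVM_HPAGE_GFN_SHIFT(x)  (((x) - 1) * 9)
#define KVM_HPAGE_SHIFT(x)      (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x)       (1ULL << KVM_HPAGE_SHIFT(x))

/* Index of a gfn within a memslot's rmap array for a given level. */
static uint64_t gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
{
        return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
        for (int level = PG_LEVEL_4K; level <= PG_LEVEL_1G; level++)
                printf("level %d: page size %llu bytes, index of gfn 0x1234 = %llu\n",
                       level, KVM_HPAGE_SIZE(level),
                       (unsigned long long)gfn_to_index(0x1234, 0, level));
        return 0;
}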
*/ - if (level > PT_PAGE_TABLE_LEVEL && - !is_large_pte(*sptep)) { + if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; @@ -3125,7 +3119,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { if (write_fault) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) @@ -3228,7 +3222,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp_ad_disabled(sp)) return; - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return; __direct_pte_prefetch(vcpu, sp, sptep); @@ -3241,12 +3235,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, pte_t *pte; int level; - BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || - PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || - PT_PDPE_LEVEL != (int)PG_LEVEL_1G); - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() @@ -3260,7 +3250,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); if (unlikely(!pte)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; return level; } @@ -3274,28 +3264,28 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; - if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) - return PT_PAGE_TABLE_LEVEL; + if (unlikely(max_level == PG_LEVEL_4K)) + return PG_LEVEL_4K; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); if (!slot) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; max_level = min(max_level, max_page_level); - for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } - if (max_level == PT_PAGE_TABLE_LEVEL) - return PT_PAGE_TABLE_LEVEL; + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) return level; level = min(level, max_level); @@ -3317,7 +3307,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, int level = *levelp; u64 spte = *it.sptep; - if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + if (it.level == level && level > PG_LEVEL_4K && is_nx_huge_page_enabled() && is_shadow_present_pte(spte) && !is_large_pte(spte)) { @@ -3574,7 +3564,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * * See the comments in kvm_arch_commit_memory_region(). 
*/ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) break; } @@ -3666,7 +3656,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, &invalid_list); mmu->root_hpa = INVALID_PAGE; } - mmu->root_cr3 = 0; + mmu->root_pgd = 0; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3686,58 +3676,64 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, + u8 level, bool direct) { struct kvm_mmu_page *sp; + + spin_lock(&vcpu->kvm->mmu_lock); + + if (make_mmu_pages_available(vcpu)) { + spin_unlock(&vcpu->kvm->mmu_lock); + return INVALID_PAGE; + } + sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + ++sp->root_count; + + spin_unlock(&vcpu->kvm->mmu_lock); + return __pa(sp->spt); +} + +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +{ + u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; + hpa_t root; unsigned i; - if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { - spin_lock(&vcpu->kvm->mmu_lock); - if(make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); + if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (!VALID_PAGE(root)) return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, 0, 0, - vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu->root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) { + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); - MMU_WARN_ON(VALID_PAGE(root)); - spin_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), + i << 30, PT32_ROOT_LEVEL, true); + if (!VALID_PAGE(root)) return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); - /* root_cr3 is ignored for direct MMUs. */ - vcpu->arch.mmu->root_cr3 = 0; + /* root_pgd is ignored for direct MMUs. */ + vcpu->arch.mmu->root_pgd = 0; return 0; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *sp; u64 pdptr, pm_mask; - gfn_t root_gfn, root_cr3; + gfn_t root_gfn, root_pgd; + hpa_t root; int i; - root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); - root_gfn = root_cr3 >> PAGE_SHIFT; + root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3747,22 +3743,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * write-protect the guests page table root. 
*/ if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu->root_hpa; - - MMU_WARN_ON(VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); - spin_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); + root = mmu_alloc_root(vcpu, root_gfn, 0, + vcpu->arch.mmu->shadow_root_level, false); + if (!VALID_PAGE(root)) return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu->root_hpa = root; - goto set_root_cr3; + goto set_root_pgd; } /* @@ -3775,9 +3763,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; - - MMU_WARN_ON(VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { @@ -3788,17 +3774,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu_check_root(vcpu, root_gfn)) return 1; } - spin_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); - return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, - 0, ACC_ALL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); + root = mmu_alloc_root(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, false); + if (!VALID_PAGE(root)) + return -ENOSPC; vcpu->arch.mmu->pae_root[i] = root | pm_mask; } vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); @@ -3828,8 +3808,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } -set_root_cr3: - vcpu->arch.mmu->root_cr3 = root_cr3; +set_root_pgd: + vcpu->arch.mmu->root_pgd = root_pgd; return 0; } @@ -4083,18 +4063,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable) { - struct kvm_memory_slot *slot; + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; - /* - * Don't expose private memslots to L2. - */ - if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { *pfn = KVM_PFN_NOSLOT; + *writable = false; return false; } - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) @@ -4135,7 +4113,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return r; if (lpage_disallowed) - max_level = PT_PAGE_TABLE_LEVEL; + max_level = PG_LEVEL_4K; if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; @@ -4171,7 +4149,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, /* This path builds a PAE pagetable, we can map 2mb pages at maximum. 
*/ return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PT_DIRECTORY_LEVEL, false); + PG_LEVEL_2M, false); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4186,7 +4164,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_reason) { + switch (vcpu->arch.apf.host_apf_flags) { default: trace_kvm_page_fault(fault_address, error_code); @@ -4196,13 +4174,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, insn_len); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: - vcpu->arch.apf.host_apf_reason = 0; + vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address, 0); + kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_reason = 0; + vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wake(fault_address); local_irq_enable(); @@ -4217,8 +4195,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, { int max_level; - for (max_level = PT_MAX_HUGEPAGE_LEVEL; - max_level > PT_PAGE_TABLE_LEVEL; + for (max_level = KVM_MAX_HUGEPAGE_LEVEL; + max_level > PG_LEVEL_4K; max_level--) { int page_num = KVM_PAGES_PER_HPAGE(max_level); gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); @@ -4237,7 +4215,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; + context->invlpg = NULL; context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; @@ -4245,51 +4223,50 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3, +static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || cr3 == root->cr3) && + return (role.direct || pgd == root->pgd) && VALID_PAGE(root->hpa) && page_header(root->hpa) && role.word == page_header(root->hpa)->role.word; } /* - * Find out if a previously cached root matching the new CR3/role is available. + * Find out if a previously cached root matching the new pgd/role is available. * The current root is also inserted into the cache. * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is * returned. * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and * false is returned. This root should now be freed by the caller. 
*/ -static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; struct kvm_mmu_root_info root; struct kvm_mmu *mmu = vcpu->arch.mmu; - root.cr3 = mmu->root_cr3; + root.pgd = mmu->root_pgd; root.hpa = mmu->root_hpa; - if (is_root_usable(&root, new_cr3, new_role)) + if (is_root_usable(&root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { swap(root, mmu->prev_roots[i]); - if (is_root_usable(&root, new_cr3, new_role)) + if (is_root_usable(&root, new_pgd, new_role)) break; } mmu->root_hpa = root.hpa; - mmu->root_cr3 = root.cr3; + mmu->root_pgd = root.pgd; return i < KVM_MMU_NUM_PREV_ROOTS; } -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role, - bool skip_tlb_flush) +static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, + union kvm_mmu_page_role new_role) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -4299,70 +4276,59 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, * later if necessary. */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - mmu->root_level >= PT64_ROOT_4LEVEL) { - if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) - return false; - - if (cached_root_available(vcpu, new_cr3, new_role)) { - /* - * It is possible that the cached previous root page is - * obsolete because of a change in the MMU generation - * number. However, changing the generation number is - * accompanied by KVM_REQ_MMU_RELOAD, which will free - * the root set here and allocate a new one. - */ - kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - if (!skip_tlb_flush) { - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - - /* - * The last MMIO access's GVA and GPA are cached in the - * VCPU. When switching to a new CR3, that GVA->GPA - * mapping may no longer be valid. So clear any cached - * MMIO info even when we don't need to sync the shadow - * page tables. - */ - vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - - __clear_sp_write_flooding_count( - page_header(mmu->root_hpa)); - - return true; - } - } + mmu->root_level >= PT64_ROOT_4LEVEL) + return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && + cached_root_available(vcpu, new_pgd, new_role); return false; } -static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role, - bool skip_tlb_flush) + bool skip_tlb_flush, bool skip_mmu_sync) { - if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, - KVM_MMU_ROOT_CURRENT); + if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); + return; + } + + /* + * It's possible that the cached previous root page is obsolete because + * of a change in the MMU generation number. However, changing the + * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will + * free the root set here and allocate a new one. + */ + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); + + if (!skip_mmu_sync || force_flush_and_sync_on_reuse) + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + if (!skip_tlb_flush || force_flush_and_sync_on_reuse) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + + /* + * The last MMIO access's GVA and GPA are cached in the VCPU. When + * switching to a new CR3, that GVA->GPA mapping may no longer be + * valid. 
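The prev_roots handling above is essentially a small most-recently-used cache keyed by (pgd, role) and reordered with swap(). A toy model of just that lookup, with simplified structures standing in for kvm_mmu_root_info and the role check dropped:

/* Toy model of cached_root_available(); not the kernel structures. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PREV_ROOTS 3

struct root_info {
        uint64_t pgd;
        uint64_t hpa;   /* 0 means invalid here */
};

struct mmu {
        struct root_info current;
        struct root_info prev[NUM_PREV_ROOTS];
};

static void swap_roots(struct root_info *a, struct root_info *b)
{
        struct root_info t = *a;
        *a = *b;
        *b = t;
}

/* Returns true if a root for new_pgd was found; on a miss the least recently
 * used entry ends up in mmu->current and must be freed by the caller. */
static bool cached_root_available(struct mmu *mmu, uint64_t new_pgd)
{
        struct root_info root = mmu->current;
        int i;

        if (root.hpa && root.pgd == new_pgd)
                return true;

        for (i = 0; i < NUM_PREV_ROOTS; i++) {
                swap_roots(&root, &mmu->prev[i]);
                if (root.hpa && root.pgd == new_pgd)
                        break;
        }

        mmu->current = root;
        return i < NUM_PREV_ROOTS;
}

int main(void)
{
        struct mmu mmu = {
                .current = { .pgd = 0x1000, .hpa = 0xaaa000 },
                .prev = { { 0x2000, 0xbbb000 }, { 0x3000, 0xccc000 }, { 0, 0 } },
        };

        printf("switch to 0x3000: %s\n",
               cached_root_available(&mmu, 0x3000) ? "hit" : "miss");
        printf("current pgd now 0x%llx\n", (unsigned long long)mmu.current.pgd);
        return 0;
}

On a miss the evicted entry is left in the current slot, which is why the code above frees KVM_MMU_ROOT_CURRENT when fast_pgd_switch() returns false.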
So clear any cached MMIO info even when we don't need to sync + * the shadow page tables. + */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); + + __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, + bool skip_mmu_sync) { - __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), - skip_tlb_flush); + __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush, skip_mmu_sync); } -EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); +EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static unsigned long get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } -static void inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - vcpu->arch.mmu->inject_page_fault(vcpu, fault); -} - static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access, int *nr_present) { @@ -4391,11 +4357,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, gpte &= level - mmu->last_nonleaf_level; /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + gpte |= level - PG_LEVEL_4K - 1; return gpte & PT_PAGE_SIZE_MASK; } @@ -4909,7 +4875,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = kvm_x86_ops.get_tdp_level(vcpu); + role.base.level = vcpu->arch.tdp_level; role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4928,9 +4894,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->mmu_role.as_u64 = new_role.as_u64; context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; + context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + context->shadow_root_level = vcpu->arch.tdp_level; context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4986,7 +4952,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) { struct kvm_mmu *context = vcpu->arch.mmu; union kvm_mmu_role new_role = @@ -4995,11 +4961,11 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) if (new_role.as_u64 == context->mmu_role.as_u64) return; - if (!is_paging(vcpu)) + if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); - else if (is_long_mode(vcpu)) + else if (efer & EFER_LMA) paging64_init_context(vcpu, context); - else if (is_pae(vcpu)) + else if (cr4 & X86_CR4_PAE) paging32E_init_context(vcpu, context); else paging32_init_context(vcpu, context); @@ -5047,7 +5013,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false); + __kvm_mmu_new_pgd(vcpu, new_eptp, 
new_role.base, true, true); if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -5077,7 +5043,11 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.mmu; - kvm_init_shadow_mmu(vcpu); + kvm_init_shadow_mmu(vcpu, + kvm_read_cr0_bits(vcpu, X86_CR0_PG), + kvm_read_cr4_bits(vcpu, X86_CR4_PAE), + vcpu->arch.efer); + context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; @@ -5097,6 +5067,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->inject_page_fault = kvm_inject_page_fault; /* + * L2 page tables are never shadowed, so there is no need to sync + * SPTEs. + */ + g_context->invlpg = NULL; + + /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using @@ -5183,7 +5159,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_pgd(vcpu); - kvm_x86_ops.tlb_flush(vcpu, true); + kvm_x86_ops.tlb_flush_current(vcpu); out: return r; } @@ -5202,7 +5178,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *new) { - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (sp->role.level != PG_LEVEL_4K) { ++vcpu->kvm->stat.mmu_pde_zapped; return; } @@ -5260,7 +5236,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (sp->role.level == PG_LEVEL_4K) return false; atomic_inc(&sp->write_flooding_count); @@ -5497,37 +5473,54 @@ emulate: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa) { - struct kvm_mmu *mmu = vcpu->arch.mmu; int i; - /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(gva, vcpu)) + /* It's actually a GPA for vcpu->arch.guest_mmu. */ + if (mmu != &vcpu->arch.guest_mmu) { + /* INVLPG on a non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + + kvm_x86_ops.tlb_flush_gva(vcpu, gva); + } + + if (!mmu->invlpg) return; - mmu->invlpg(vcpu, gva, mmu->root_hpa); + if (root_hpa == INVALID_PAGE) { + mmu->invlpg(vcpu, gva, mmu->root_hpa); - /* - * INVLPG is required to invalidate any global mappings for the VA, - * irrespective of PCID. Since it would take us roughly similar amount - * of work to determine whether any of the prev_root mappings of the VA - * is marked global, or to just sync it blindly, so we might as well - * just always sync it. - * - * Mappings not reachable via the current cr3 or the prev_roots will be - * synced when switching to that cr3, so nothing needs to be done here - * for them. - */ - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (VALID_PAGE(mmu->prev_roots[i].hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. 
+ * + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + } else { + mmu->invlpg(vcpu, gva, root_hpa); + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva); - kvm_x86_ops.tlb_flush_gva(vcpu, gva); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -5541,7 +5534,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } @@ -5574,9 +5567,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) if (tdp_enabled) max_page_level = tdp_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PT_PDPE_LEVEL; + max_page_level = PG_LEVEL_1G; else - max_page_level = PT_DIRECTORY_LEVEL; + max_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5632,24 +5625,24 @@ static __always_inline bool slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } static __always_inline bool slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1, + KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, + PG_LEVEL_4K, lock_flush_tlb); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -5672,7 +5665,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. 
*/ - if (tdp_enabled && kvm_x86_ops.get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5695,13 +5688,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_cr3 = 0; + vcpu->arch.root_mmu.root_pgd = 0; vcpu->arch.root_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_cr3 = 0; + vcpu->arch.guest_mmu.root_pgd = 0; vcpu->arch.guest_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -5859,7 +5852,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true); } } @@ -5881,7 +5875,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, PT_MAX_HUGEPAGE_LEVEL, false); + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); spin_unlock(&kvm->mmu_lock); /* @@ -6142,27 +6136,18 @@ static void kvm_set_mmio_spte_mask(void) u64 mask; /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. 
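The idea behind the new mask computation above can be modelled compactly: a reserved physical-address bit plus the present bit forces a page fault with PFEC.RSVD=1, while a tag in software-available bits marks the entry as an MMIO SPTE. The bit positions below are placeholders, not KVM's SPTE_SPECIAL_MASK layout.

/* Illustrative MMIO-SPTE tagging; the bit layout is assumed, not KVM's. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)              (1ULL << (n))
#define PT_PRESENT_MASK         BIT_ULL(0)
#define MMIO_TAG_MASK           (3ULL << 52)    /* assumed software-available bits */

static uint64_t mmio_mask;      /* reserved-PA bit + present, or 0 */

static void set_mmio_spte_mask(unsigned int shadow_phys_bits)
{
        /* With 52 PA bits there is no reserved bit left to abuse. */
        if (shadow_phys_bits < 52)
                mmio_mask = BIT_ULL(51) | PT_PRESENT_MASK;
        else
                mmio_mask = 0;
}

static uint64_t make_mmio_spte(void)
{
        return mmio_mask | MMIO_TAG_MASK;
}

static bool is_mmio_spte(uint64_t spte)
{
        return (spte & MMIO_TAG_MASK) == MMIO_TAG_MASK;
}

int main(void)
{
        set_mmio_spte_mask(46);
        uint64_t spte = make_mmio_spte();
        printf("spte=0x%016llx mmio=%d\n",
               (unsigned long long)spte, is_mmio_spte(spte));
        return 0;
}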
- */ - if (shadow_phys_bits == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK); } static bool get_nx_auto_mode(void) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index ddc1ec3bdacd..a7bcde34d1f2 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -61,7 +61,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, { int index, val; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); val = slot->arch.gfn_track[mode][index]; @@ -151,7 +151,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return false; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 9bdf9b7d9a96..38c576495048 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -75,7 +75,7 @@ #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) -#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K) /* * The guest_walker structure emulates the behavior of the hardware page @@ -198,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; - if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K)) goto no_present; return false; @@ -436,7 +436,7 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) + if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); @@ -552,7 +552,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. 
*/ - mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -575,7 +575,7 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, u64 mask; int r, index; - if (level == PT_PAGE_TABLE_LEVEL) { + if (level == PG_LEVEL_4K) { mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); @@ -600,7 +600,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, sp = page_header(__pa(sptep)); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return; if (sp->role.direct) @@ -812,7 +812,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, if (!r) { pgprintk("%s: guest page fault\n", __func__); if (!prefault) - inject_page_fault(vcpu, &walker.fault); + kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } @@ -828,7 +828,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (lpage_disallowed || is_self_change_mapping) - max_level = PT_PAGE_TABLE_LEVEL; + max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -884,7 +884,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; @@ -1070,7 +1070,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PT_PAGE_TABLE_LEVEL, + pte_access, PG_LEVEL_4K, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ca39f62aabc6..9d2844f87f6d 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -100,7 +100,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) sp = page_header(__pa(sptep)); if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { + if (level != PG_LEVEL_4K) { audit_printk(vcpu->kvm, "unsync sp: %p " "level = %d\n", sp, level); return; @@ -176,7 +176,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { int i; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) + if (sp->role.level != PG_LEVEL_4K) return; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); + rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a5078841bdac..b86346903f2e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -397,9 +397,9 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) __set_bit(pmc->idx, pmu->pmc_in_use); } -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr, data); + return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data 
*msr_info) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a6c78a797cb1..ab85eed8a6cc 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -32,7 +32,7 @@ struct kvm_pmu_ops { struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void (*refresh)(struct kvm_vcpu *vcpu); void (*init)(struct kvm_vcpu *vcpu); @@ -147,7 +147,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9a2a62e5afeb..8a6db11dcb43 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,8 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "cpuid.h" +#include "lapic.h" #include "svm.h" static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -59,7 +61,7 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); - u64 cr3 = svm->nested.nested_cr3; + u64 cr3 = svm->nested.ctl.nested_cr3; u64 pdpte; int ret; @@ -74,19 +76,22 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - return svm->nested.nested_cr3; + return svm->nested.ctl.nested_cr3; } static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *hsave = svm->nested.hsave; + WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_mmu(vcpu); + kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level; reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -99,8 +104,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h; - struct nested_state *g; + struct vmcb_control_area *c, *h, *g; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -109,14 +113,16 @@ void recalc_intercepts(struct vcpu_svm *svm) c = &svm->vmcb->control; h = &svm->nested.hsave->control; - g = &svm->nested; + g = &svm->nested.ctl; + + svm->nested.host_intercept_exceptions = h->intercept_exceptions; c->intercept_cr = h->intercept_cr; c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; c->intercept = h->intercept; - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want 
the cr8 intercept bits of L1 */ c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); @@ -138,11 +144,9 @@ void recalc_intercepts(struct vcpu_svm *svm) c->intercept |= g->intercept; } -static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +static void copy_vmcb_control_area(struct vmcb_control_area *dst, + struct vmcb_control_area *from) { - struct vmcb_control_area *dst = &dst_vmcb->control; - struct vmcb_control_area *from = &from_vmcb->control; - dst->intercept_cr = from->intercept_cr; dst->intercept_dr = from->intercept_dr; dst->intercept_exceptions = from->intercept_exceptions; @@ -150,7 +154,7 @@ static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; + /* asid not copied, it is handled manually for svm->vmcb. */ dst->tlb_ctl = from->tlb_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; @@ -179,7 +183,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -190,7 +194,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; - offset = svm->nested.vmcb_msrpm + (p * 4); + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) return false; @@ -203,41 +207,111 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } -static bool nested_vmcb_checks(struct vmcb *vmcb) +static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((vmcb->save.efer & EFER_SVME) == 0) + if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; - if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + if (control->asid == 0) return false; - if (vmcb->control.asid == 0) - return false; - - if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && + if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled) return false; return true; } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map) +static bool nested_vmcb_checks(struct vmcb *vmcb) { - bool evaluate_pending_interrupts = - is_intercept(svm, INTERCEPT_VINTR) || - is_intercept(svm, INTERCEPT_IRET); + if ((vmcb->save.efer & EFER_SVME) == 0) + return false; - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && + (vmcb->save.cr0 & X86_CR0_NW)) + return false; - if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { - svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; - nested_svm_init_mmu_context(&svm->vcpu); + return nested_vmcb_check_controls(&vmcb->control); +} + +static void load_nested_vmcb_control(struct vcpu_svm *svm, + struct vmcb_control_area *control) +{ + copy_vmcb_control_area(&svm->nested.ctl, control); + + /* Copy it here because nested_svm_check_controls will check it. */ + svm->nested.ctl.asid = control->asid; + svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; + svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; +} + +/* + * Synchronize fields that are written by the processor, so that + * they can be copied back into the nested_vmcb. 
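For reference, the EXIT_INT_INFO word built by nested_vmcb_save_pending_event() below follows the standard SVM event-injection layout (vector in bits 7:0, type in bits 10:8, error-code-valid and valid bits). A minimal stand-alone encoder, with the constants restated here rather than taken from svm.h:

/* Encoder for the EXITINTINFO/EVENTINJ format used below. */
#include <stdint.h>
#include <stdio.h>

#define SVM_EVTINJ_TYPE_INTR    (0 << 8)
#define SVM_EVTINJ_TYPE_NMI     (2 << 8)
#define SVM_EVTINJ_TYPE_EXEPT   (3 << 8)
#define SVM_EVTINJ_TYPE_SOFT    (4 << 8)
#define SVM_EVTINJ_VALID        (1u << 31)
#define SVM_EVTINJ_VALID_ERR    (1u << 11)

static uint32_t encode_exception(uint8_t vector, int has_error_code)
{
        uint32_t info = vector | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;

        if (has_error_code)
                info |= SVM_EVTINJ_VALID_ERR;
        return info;
}

int main(void)
{
        /* A #PF (vector 14) with an error code, as L1 would see it. */
        printf("exit_int_info = 0x%08x\n", encode_exception(14, 1));
        return 0;
}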
+ */ +void sync_nested_vmcb_control(struct vcpu_svm *svm) +{ + u32 mask; + svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; + svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err; + + /* Only a few fields of int_ctl are written by the processor. */ + mask = V_IRQ_MASK | V_TPR_MASK; + if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && + is_intercept(svm, SVM_EXIT_VINTR)) { + /* + * In order to request an interrupt window, L0 is usurping + * svm->vmcb->control.int_ctl and possibly setting V_IRQ + * even if it was clear in L1's VMCB. Restoring it would be + * wrong. However, in this case V_IRQ will remain true until + * interrupt_window_interception calls svm_clear_vintr and + * restores int_ctl. We can just leave it aside. + */ + mask &= ~V_IRQ_MASK; } + svm->nested.ctl.int_ctl &= ~mask; + svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; +} + +/* + * Transfer any event that L0 or L1 wanted to inject into L2 to + * EXIT_INT_INFO. + */ +static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, + struct vmcb *nested_vmcb) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 exit_int_info = 0; + unsigned int nr; + + if (vcpu->arch.exception.injected) { + nr = vcpu->arch.exception.nr; + exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; + + if (vcpu->arch.exception.has_error_code) { + exit_int_info |= SVM_EVTINJ_VALID_ERR; + nested_vmcb->control.exit_int_info_err = + vcpu->arch.exception.error_code; + } + + } else if (vcpu->arch.nmi_injected) { + exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + } else if (vcpu->arch.interrupt.injected) { + nr = vcpu->arch.interrupt.nr; + exit_int_info = nr | SVM_EVTINJ_VALID; + + if (vcpu->arch.interrupt.soft) + exit_int_info |= SVM_EVTINJ_TYPE_SOFT; + else + exit_int_info |= SVM_EVTINJ_TYPE_INTR; + } + + nested_vmcb->control.exit_int_info = exit_int_info; +} + +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +{ /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -249,14 +323,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = nested_vmcb->save.cr3; - svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); @@ -270,38 +337,34 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->save.dr7 = nested_vmcb->save.dr7; svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; +} - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; - svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; +static void nested_prepare_vmcb_control(struct vcpu_svm *svm) +{ + const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) + nested_svm_init_mmu_context(&svm->vcpu); - /* cache intercepts */ - svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; - svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; - 
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; - svm->nested.intercept = nested_vmcb->control.intercept; + /* Guest paging mode is active - reset mmu */ + kvm_mmu_reset_context(&svm->vcpu); - svm_flush_tlb(&svm->vcpu, true); - svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; - if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) - svm->vcpu.arch.hflags |= HF_VINTR_MASK; - else - svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + svm_flush_tlb(&svm->vcpu); - svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = + svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; - svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; - svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; - svm->vmcb->control.int_state = nested_vmcb->control.int_state; - svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; - svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + svm->vmcb->control.int_ctl = + (svm->nested.ctl.int_ctl & ~mask) | + (svm->nested.hsave->control.int_ctl & mask); - svm->vmcb->control.pause_filter_count = - nested_vmcb->control.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = - nested_vmcb->control.pause_filter_thresh; + svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; + svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; + svm->vmcb->control.int_state = svm->nested.ctl.int_state; + svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; + svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; - kvm_vcpu_unmap(&svm->vcpu, map, true); + svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; + svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; /* Enter Guest-Mode */ enter_guest_mode(&svm->vcpu); @@ -312,25 +375,18 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, */ recalc_intercepts(svm); - svm->nested.vmcb = vmcb_gpa; + mark_all_dirty(svm->vmcb); +} - /* - * If L1 had a pending IRQ/NMI before executing VMRUN, - * which wasn't delivered because it was disallowed (e.g. - * interrupts disabled), L0 needs to evaluate if this pending - * event should cause an exit from L2 to L1 or be delivered - * directly to L2. - * - * Usually this would be handled by the processor noticing an - * IRQ/NMI window request. However, VMRUN can unblock interrupts - * by implicitly setting GIF, so force L0 to perform pending event - * evaluation by requesting a KVM_REQ_EVENT. 
- */ - enable_gif(svm); - if (unlikely(evaluate_pending_interrupts)) - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); +void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb) +{ + svm->nested.vmcb = vmcb_gpa; + load_nested_vmcb_control(svm, &nested_vmcb->control); + nested_prepare_vmcb_save(svm, nested_vmcb); + nested_prepare_vmcb_control(svm); - mark_all_dirty(svm->vmcb); + svm_set_gif(svm, true); } int nested_svm_vmrun(struct vcpu_svm *svm) @@ -342,8 +398,12 @@ int nested_svm_vmrun(struct vcpu_svm *svm) struct kvm_host_map map; u64 vmcb_gpa; - vmcb_gpa = svm->vmcb->save.rax; + if (is_smm(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + vmcb_gpa = svm->vmcb->save.rax; ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); @@ -361,10 +421,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_code_hi = 0; nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - - kvm_vcpu_unmap(&svm->vcpu, &map, true); - - return ret; + goto out; } trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, @@ -404,9 +461,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm) else hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - copy_vmcb_control_area(hsave, vmcb); + copy_vmcb_control_area(&hsave->control, &vmcb->control); - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); + svm->nested.nested_run_pending = 1; + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); if (!nested_svm_vmrun_msrpm(svm)) { svm->vmcb->control.exit_code = SVM_EXIT_ERR; @@ -417,6 +475,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_svm_vmexit(svm); } +out: + kvm_vcpu_unmap(&svm->vcpu, &map, true); + return ret; } @@ -444,13 +505,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, - vmcb->control.exit_info_1, - vmcb->control.exit_info_2, - vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); if (rc) { if (rc == -EINVAL) @@ -463,9 +517,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); svm->nested.vmcb = 0; + WARN_ON_ONCE(svm->nested.nested_run_pending); + + /* in case we halted in L2 */ + svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; /* Give the current vmcb to the guest */ - disable_gif(svm); + svm_set_gif(svm, false); nested_vmcb->save.es = vmcb->save.es; nested_vmcb->save.cs = vmcb->save.cs; @@ -479,62 +537,42 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.cr2 = vmcb->save.cr2; nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = vmcb->save.rip; - nested_vmcb->save.rsp = vmcb->save.rsp; - nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu); + nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu); + nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu); nested_vmcb->save.dr7 = vmcb->save.dr7; nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; nested_vmcb->save.cpl = vmcb->save.cpl; - nested_vmcb->control.int_ctl = vmcb->control.int_ctl; - nested_vmcb->control.int_vector = vmcb->control.int_vector; nested_vmcb->control.int_state = vmcb->control.int_state; nested_vmcb->control.exit_code = vmcb->control.exit_code; nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; nested_vmcb->control.exit_info_1 = 
vmcb->control.exit_info_1; nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; - nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + + if (nested_vmcb->control.exit_code != SVM_EXIT_ERR) + nested_vmcb_save_pending_event(svm, nested_vmcb); if (svm->nrips_enabled) nested_vmcb->control.next_rip = vmcb->control.next_rip; - /* - * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have - * to make sure that we do not lose injected events. So check event_inj - * here and copy it to exit_int_info if it is valid. - * Exit_int_info and event_inj can't be both valid because the case - * below only happens on a VMRUN instruction intercept which has - * no valid exit_int_info set. - */ - if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { - struct vmcb_control_area *nc = &nested_vmcb->control; - - nc->exit_int_info = vmcb->control.event_inj; - nc->exit_int_info_err = vmcb->control.event_inj_err; - } - - nested_vmcb->control.tlb_ctl = 0; - nested_vmcb->control.event_inj = 0; - nested_vmcb->control.event_inj_err = 0; + nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; + nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; + nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; nested_vmcb->control.pause_filter_count = svm->vmcb->control.pause_filter_count; nested_vmcb->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; - /* We always set V_INTR_MASKING and remember the old value in hflags */ - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - /* Restore the original control entries */ - copy_vmcb_control_area(vmcb, hsave); + copy_vmcb_control_area(&vmcb->control, &hsave->control); - svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = + svm->vcpu.arch.l1_tsc_offset; - svm->nested.nested_cr3 = 0; + svm->nested.ctl.nested_cr3 = 0; /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; @@ -562,6 +600,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) mark_all_dirty(svm->vmcb); + trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, + nested_vmcb->control.exit_info_1, + nested_vmcb->control.exit_info_2, + nested_vmcb->control.exit_int_info, + nested_vmcb->control.exit_int_info_err, + KVM_ISA_SVM); + kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); @@ -579,12 +624,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
+ */ +void svm_leave_nested(struct vcpu_svm *svm) +{ + if (is_guest_mode(&svm->vcpu)) { + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + + svm->nested.nested_run_pending = 0; + leave_guest_mode(&svm->vcpu); + copy_vmcb_control_area(&vmcb->control, &hsave->control); + nested_svm_uninit_mmu_context(&svm->vcpu); + } +} + static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { u32 offset, msr, value; int write, mask; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -598,56 +659,12 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* Offset is in 32 bit units but need in 8 bit units */ offset *= 4; - if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4)) return NESTED_EXIT_DONE; return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } -/* DB exceptions for our internal use must not cause vmexit */ -static int nested_svm_intercept_db(struct vcpu_svm *svm) -{ - unsigned long dr6 = svm->vmcb->save.dr6; - - /* Always catch it and pass it to userspace if debugging. */ - if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return NESTED_EXIT_HOST; - - /* if we're not singlestepping, it's not ours */ - if (!svm->nmi_singlestep) - goto reflected_db; - - /* if it's not a singlestep exception, it's not ours */ - if (!(dr6 & DR6_BS)) - goto reflected_db; - - /* if the guest is singlestepping, it should get the vmexit */ - if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { - disable_nmi_singlestep(svm); - goto reflected_db; - } - - /* it's ours, the nested hypervisor must not see this one */ - return NESTED_EXIT_HOST; - -reflected_db: - /* - * Synchronize guest DR6 here just like in kvm_deliver_exception_payload; - * it will be moved into the nested VMCB by nested_svm_vmexit. Once - * exceptions will be moved to svm_check_nested_events, all this stuff - * will just go away and we could just return NESTED_EXIT_HOST - * unconditionally. db_interception will queue the exception, which - * will be processed by svm_check_nested_events if a nested vmexit is - * required, and we will just use kvm_deliver_exception_payload to copy - * the payload to DR6 before vmexit. - */ - WARN_ON(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT); - svm->vcpu.arch.dr6 &= ~(DR_TRAP_BITS | DR6_RTM); - svm->vcpu.arch.dr6 |= dr6 & ~DR6_FIXED_1; - return NESTED_EXIT_DONE; -} - static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { unsigned port, size, iopm_len; @@ -655,13 +672,13 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - gpa = svm->nested.vmcb_iopm + (port / 8); + gpa = svm->nested.ctl.iopm_base_pa + (port / 8); start_bit = port % 8; iopm_len = (start_bit + size > 8) ? 2 : 1; mask = (0xf >> (4 - size)) << start_bit; @@ -687,31 +704,23 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; case SVM_EXIT_READ_CR0 ... 
SVM_EXIT_WRITE_CR8: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.intercept_cr & bit) + if (svm->nested.ctl.intercept_cr & bit) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.intercept_dr & bit) + if (svm->nested.ctl.intercept_dr & bit) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { - u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) { - if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) - vmexit = nested_svm_intercept_db(svm); - else if (exit_code == SVM_EXIT_EXCP_BASE + BP_VECTOR && - svm->vcpu.guest_debug & KVM_GUESTDBG_USE_SW_BP) - vmexit = NESTED_EXIT_HOST; - else - vmexit = NESTED_EXIT_DONE; - } - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->vcpu.arch.exception.nested_apf != 0) - vmexit = NESTED_EXIT_DONE; + /* + * Host-intercepted exceptions have been checked already in + * nested_svm_exit_special. There is nothing to do here, + * the vmexit is injected by svm_check_nested_events. + */ + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_ERR: { @@ -720,7 +729,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.intercept & exit_bits) + if (svm->nested.ctl.intercept & exit_bits) vmexit = NESTED_EXIT_DONE; } } @@ -756,62 +765,140 @@ int nested_svm_check_permissions(struct vcpu_svm *svm) return 0; } -int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code) +static bool nested_exit_on_exception(struct vcpu_svm *svm) { - int vmexit; + unsigned int nr = svm->vcpu.arch.exception.nr; - if (!is_guest_mode(&svm->vcpu)) - return 0; + return (svm->nested.ctl.intercept_exceptions & (1 << nr)); +} - vmexit = nested_svm_intercept(svm); - if (vmexit != NESTED_EXIT_DONE) - return 0; +static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) +{ + unsigned int nr = svm->vcpu.arch.exception.nr; svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; + + if (svm->vcpu.arch.exception.has_error_code) + svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; /* * EXITINFO2 is undefined for all exception intercepts other * than #PF. */ - if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; - else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + if (nr == PF_VECTOR) { + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else if (svm->vcpu.arch.exception.has_payload) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + } else if (nr == DB_VECTOR) { + /* See inject_pending_event. 
*/ + kvm_deliver_exception_payload(&svm->vcpu); + if (svm->vcpu.arch.dr7 & DR7_GD) { + svm->vcpu.arch.dr7 &= ~DR7_GD; + kvm_update_dr7(&svm->vcpu); + } + } else + WARN_ON(svm->vcpu.arch.exception.has_payload); - svm->nested.exit_required = true; - return vmexit; + nested_svm_vmexit(svm); +} + +static void nested_svm_smi(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_SMI; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); +} + +static void nested_svm_nmi(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_NMI; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); } static void nested_svm_intr(struct vcpu_svm *svm) { + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; - /* nested_svm_vmexit this gets called afterwards from handle_exit */ - svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + nested_svm_vmexit(svm); } -static bool nested_exit_on_intr(struct vcpu_svm *svm) +static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return (svm->nested.intercept & 1ULL); + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT)); +} + +static void nested_svm_init(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_INIT; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); } -int svm_check_nested_events(struct kvm_vcpu *vcpu) + +static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool block_nested_events = - kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required; + kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending; + struct kvm_lapic *apic = vcpu->arch.apic; + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_INIT, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_init(svm)) + return 0; + nested_svm_init(svm); + return 0; + } + + if (vcpu->arch.exception.pending) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_exception(svm)) + return 0; + nested_svm_inject_exception_vmexit(svm); + return 0; + } + + if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_smi(svm)) + return 0; + nested_svm_smi(svm); + return 0; + } + + if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_nmi(svm)) + return 0; + nested_svm_nmi(svm); + return 0; + } - if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) { + if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_exit_on_intr(svm)) + return 0; nested_svm_intr(svm); return 0; } @@ -826,21 +913,170 @@ int nested_svm_exit_special(struct vcpu_svm *svm) switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: - case SVM_EXIT_EXCP_BASE + MC_VECTOR: - return NESTED_EXIT_HOST; case SVM_EXIT_NPF: - /* For now we are always handling NPFs when using them */ - if (npt_enabled) + return NESTED_EXIT_HOST; + case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + + if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) return NESTED_EXIT_HOST; - break; - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) + else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && + svm->vcpu.arch.apf.host_apf_flags) + /* Trap async PF even if not shadowing */ return NESTED_EXIT_HOST; break; + } default: break; } return NESTED_EXIT_CONTINUE; } + +static int svm_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_svm *svm; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = KVM_STATE_NESTED_FORMAT_SVM, + .size = sizeof(kvm_state), + }; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + &user_kvm_nested_state->data.svm[0]; + + if (!vcpu) + return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE; + + svm = to_svm(vcpu); + + if (user_data_size < kvm_state.size) + goto out; + + /* First fill in the header and copy it out. */ + if (is_guest_mode(vcpu)) { + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (svm->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + + if (gif_set(svm)) + kvm_state.flags |= KVM_STATE_NESTED_GIF_SET; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (!is_guest_mode(vcpu)) + goto out; + + /* + * Copy over the full size of the VMCB rather than just the size + * of the structs. + */ + if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) + return -EFAULT; + if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, + sizeof(user_vmcb->control))) + return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save, + sizeof(user_vmcb->save))) + return -EFAULT; + +out: + return kvm_state.size; +} + +static int svm_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *hsave = svm->nested.hsave; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + &user_kvm_nested_state->data.svm[0]; + struct vmcb_control_area ctl; + struct vmcb_save_area save; + u32 cr0; + + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) + return -EINVAL; + + if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GIF_SET)) + return -EINVAL; + + /* + * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's + * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed. + */ + if (!(vcpu->arch.efer & EFER_SVME)) { + /* GIF=1 and no guest mode are required if SVME=0. */ + if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET) + return -EINVAL; + } + + /* SMM temporarily disables SVM, so we cannot be in guest mode. 
*/ + if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { + svm_leave_nested(svm); + goto out_set_gif; + } + + if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) + return -EINVAL; + if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) + return -EINVAL; + if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) + return -EFAULT; + if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) + return -EFAULT; + + if (!nested_vmcb_check_controls(&ctl)) + return -EINVAL; + + /* + * Processor state contains L2 state. Check that it is + * valid for guest mode (see nested_vmcb_checks). + */ + cr0 = kvm_read_cr0(vcpu); + if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) + return -EINVAL; + + /* + * Validate host state saved from before VMRUN (see + * nested_svm_check_permissions). + * TODO: validate reserved bits for all saved state. + */ + if (!(save.cr0 & X86_CR0_PG)) + return -EINVAL; + + /* + * All checks done, we can enter guest mode. L1 control fields + * come from the nested save state. Guest state is already + * in the registers, the save area of the nested state instead + * contains saved L1 state. + */ + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); + hsave->save = save; + + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; + load_nested_vmcb_control(svm, &ctl); + nested_prepare_vmcb_control(svm); + +out_set_gif: + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; +} + +struct kvm_x86_nested_ops svm_nested_ops = { + .check_events = svm_check_nested_events, + .get_state = svm_get_nested_state, + .set_state = svm_set_nested_state, +}; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index ce0b10fe5e2b..035da07500e8 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -215,21 +215,22 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } -static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; + u32 msr = msr_info->index; /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - *data = pmc_read_counter(pmc); + msr_info->data = pmc_read_counter(pmc); return 0; } /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - *data = pmc->eventsel; + msr_info->data = pmc->eventsel; return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a862c768fd54..9e333b91ff78 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -33,6 +33,7 @@ #include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/irq_remapping.h> +#include <asm/mce.h> #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> @@ -264,6 +265,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu) void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.efer = efer; if (!npt_enabled) { @@ -274,8 +276,13 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + } + + svm->vmcb->save.efer = efer | EFER_SVME; + mark_dirty(svm->vmcb, VMCB_CR); } static int is_external_interrupt(u32 info) @@ -318,9 +325,6 @@ static int 
skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; } else { - if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) - pr_err("%s: ip 0x%lx next 0x%llx\n", - __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); } svm_set_interrupt_shadow(vcpu, 0); @@ -333,17 +337,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned nr = vcpu->arch.exception.nr; bool has_error_code = vcpu->arch.exception.has_error_code; - bool reinject = vcpu->arch.exception.injected; u32 error_code = vcpu->arch.exception.error_code; - /* - * If we are within a nested VM we'd better #VMEXIT and let the guest - * handle the exception - */ - if (!reinject && - nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - kvm_deliver_exception_payload(&svm->vcpu); if (nr == BP_VECTOR && !nrips) { @@ -780,7 +775,7 @@ static __init void svm_adjust_mmio_mask(void) */ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK); } static void svm_hardware_teardown(void) @@ -890,7 +885,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); + kvm_configure_mmu(npt_enabled, PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); if (nrips) { @@ -953,16 +948,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (is_guest_mode(vcpu)) - return svm->nested.hsave->control.tsc_offset; - - return vcpu->arch.tsc_offset; -} - static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1208,6 +1193,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->avic_is_running = true; svm->nested.hsave = page_address(hsave_page); + clear_page(svm->nested.hsave); svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); @@ -1364,12 +1350,13 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static inline void svm_enable_vintr(struct vcpu_svm *svm) +static void svm_set_vintr(struct vcpu_svm *svm) { struct vmcb_control_area *control; /* The following fields are ignored when AVIC is enabled */ WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + set_intercept(svm, INTERCEPT_VINTR); /* * This is just a dummy VINTR to actually cause a vmexit to happen. @@ -1383,18 +1370,19 @@ static inline void svm_enable_vintr(struct vcpu_svm *svm) mark_dirty(svm->vmcb, VMCB_INTR); } -static void svm_set_vintr(struct vcpu_svm *svm) -{ - set_intercept(svm, INTERCEPT_VINTR); - if (is_intercept(svm, INTERCEPT_VINTR)) - svm_enable_vintr(svm); -} - static void svm_clear_vintr(struct vcpu_svm *svm) { + const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; clr_intercept(svm, INTERCEPT_VINTR); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + /* Drop int_ctl fields related to VINTR injection. 
*/ + svm->vmcb->control.int_ctl &= mask; + if (is_guest_mode(&svm->vcpu)) { + WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != + (svm->nested.ctl.int_ctl & V_TPR_MASK)); + svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; + } + mark_dirty(svm->vmcb, VMCB_INTR); }
@@ -1533,14 +1521,6 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) mark_dirty(svm->vmcb, VMCB_DT); } -static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ -} - -static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ -} - static void update_cr0_intercept(struct vcpu_svm *svm) { ulong gcr0 = svm->vcpu.arch.cr0;
@@ -1603,7 +1583,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu, true); + svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled)
@@ -1842,6 +1822,25 @@ static bool is_erratum_383(void) return true; } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(&regs, 0); +#endif +} + static void svm_handle_mce(struct vcpu_svm *svm) { if (is_erratum_383()) {
@@ -1860,11 +1859,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. */ - asm volatile ( - "int $0x12\n"); - /* not sure if we ever come back to this point */ - - return; + kvm_machine_check(); } static int mc_interception(struct vcpu_svm *svm)
@@ -1993,6 +1988,38 @@ static int vmrun_interception(struct vcpu_svm *svm) return nested_svm_vmrun(svm); } +void svm_set_gif(struct vcpu_svm *svm, bool value) +{ + if (value) { + /* + * If VGIF is enabled, the STGI intercept is only added to + * detect the opening of the SMI/NMI window; remove it now. + * Likewise, clear the VINTR intercept, we will set it + * again while processing KVM_REQ_EVENT if needed. + */ + if (vgif_enabled(svm)) + clr_intercept(svm, INTERCEPT_STGI); + if (is_intercept(svm, SVM_EXIT_VINTR)) + svm_clear_vintr(svm); + + enable_gif(svm); + if (svm->vcpu.arch.smi_pending || + svm->vcpu.arch.nmi_pending || + kvm_cpu_has_injectable_intr(&svm->vcpu)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } else { + disable_gif(svm); + + /* + * After a CLGI no interrupts should come. But if vGIF is + * in use, we still rely on the VINTR intercept (rather than + * STGI) to detect an open interrupt window. + */ + if (!vgif_enabled(svm)) + svm_clear_vintr(svm); + } +} + static int stgi_interception(struct vcpu_svm *svm) { int ret;
@@ -2000,18 +2027,8 @@ static int stgi_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - /* - * If VGIF is enabled, the STGI intercept is only added to - * detect the opening of the SMI/NMI window; remove it now. 
- */ - if (vgif_enabled(svm)) - clr_intercept(svm, INTERCEPT_STGI); - ret = kvm_skip_emulated_instruction(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - - enable_gif(svm); - + svm_set_gif(svm, true); return ret; } @@ -2023,13 +2040,7 @@ static int clgi_interception(struct vcpu_svm *svm) return 1; ret = kvm_skip_emulated_instruction(&svm->vcpu); - - disable_gif(svm); - - /* After a CLGI no interrupts should come */ - if (!kvm_vcpu_apicv_active(&svm->vcpu)) - svm_clear_vintr(svm); - + svm_set_gif(svm, false); return ret; } @@ -2193,7 +2204,7 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, bool ret = false; u64 intercept; - intercept = svm->nested.intercept; + intercept = svm->nested.ctl.intercept; if (!is_guest_mode(&svm->vcpu) || (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) @@ -2671,8 +2682,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) */ svm_toggle_avic_for_irq_window(&svm->vcpu, true); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; return 1; } @@ -2898,8 +2907,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion exit_fastpath) +static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -2912,12 +2920,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (unlikely(svm->nested.exit_required)) { - nested_svm_vmexit(svm); - svm->nested.exit_required = false; - - return 1; - } + svm_complete_interrupts(svm); if (is_guest_mode(vcpu)) { int vmexit; @@ -2938,8 +2941,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, return 1; } - svm_complete_interrupts(svm); - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2957,10 +2958,10 @@ static int handle_exit(struct kvm_vcpu *vcpu, __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { - kvm_skip_emulated_instruction(vcpu); + if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -3049,18 +3050,37 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } -static int svm_nmi_allowed(struct kvm_vcpu *vcpu) +bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int ret; - ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - !(svm->vcpu.arch.hflags & HF_NMI_MASK); - ret = ret && gif_set(svm) && nested_svm_nmi(svm); + bool ret; + + if (!gif_set(svm)) + return true; + + if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) + return false; + + ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + (svm->vcpu.arch.hflags & HF_NMI_MASK); return ret; } +static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) + return -EBUSY; + + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. 
*/ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) + return -EBUSY; + + return !svm_nmi_blocked(vcpu); +} + static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3081,19 +3101,46 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) +bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!gif_set(svm) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) - return 0; + if (!gif_set(svm)) + return true; - if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return !!(svm->vcpu.arch.hflags & HF_HIF_MASK); - else - return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); + if (is_guest_mode(vcpu)) { + /* As long as interrupts are being delivered... */ + if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) + ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) + : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + return true; + + /* ... vmexits aren't blocked by the interrupt shadow */ + if (nested_exit_on_intr(svm)) + return false; + } else { + if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + return true; + } + + return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); +} + +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) + return -EBUSY; + + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. + */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) + return -EBUSY; + + return !svm_interrupt_blocked(vcpu); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@ -3134,9 +3181,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; /* STGI will cause a vm exit */ } - if (svm->nested.exit_required) - return; /* we're not going to run the guest yet */ - /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) @@ -3156,10 +3200,17 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) return 0; } -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +void svm_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * Flush only the current ASID even if the TLB flush was invoked via + * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all + * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and + * unconditionally does a TLB flush on both nested VM-Enter and nested + * VM-Exit (via kvm_mmu_reset_context()). 
+ */ if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; else @@ -3279,10 +3330,21 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } +static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + + return EXIT_FASTPATH_NONE; +} + void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); -static void svm_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { + fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3290,13 +3352,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; /* - * A vmexit emulation is required before the vcpu can be executed - * again. - */ - if (unlikely(svm->nested.exit_required)) - return; - - /* * Disable singlestep if we're injecting an interrupt/exception. * We don't want our modified rflags to be pushed on the stack where * we might not be able to easily reset them if we disabled NMI @@ -3387,6 +3442,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ + exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -3394,12 +3450,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_cr8_to_lapic(vcpu); svm->next_rip = 0; + if (is_guest_mode(&svm->vcpu)) { + sync_nested_vmcb_control(svm); + svm->nested.nested_run_pending = 0; + } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_flags = + kvm_read_and_reset_apf_flags(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); @@ -3415,12 +3476,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); mark_all_clean(svm->vmcb); + return exit_fastpath; } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - bool update_guest_cr3 = true; unsigned long cr3; cr3 = __sme_set(root); @@ -3429,18 +3490,13 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ - if (is_guest_mode(vcpu)) - update_guest_cr3 = false; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - cr3 = vcpu->arch.cr3; - else /* CR3 is already up-to-date. 
*/ - update_guest_cr3 = false; + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + return; + cr3 = vcpu->arch.cr3; } - if (update_guest_cr3) { - svm->vmcb->save.cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_CR); - } + svm->vmcb->save.cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_CR); } static int is_disabled(void) @@ -3475,7 +3531,7 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_emulated_msr(int index) +static bool svm_has_emulated_msr(u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: @@ -3628,7 +3684,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - intercept = svm->nested.intercept; + intercept = svm->nested.ctl.intercept; if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) break; @@ -3716,13 +3772,8 @@ out: return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) @@ -3737,23 +3788,28 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } -static int svm_smi_allowed(struct kvm_vcpu *vcpu) +bool svm_smi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); /* Per APM Vol.2 15.22.2 "Response to SMI" */ if (!gif_set(svm)) - return 0; + return true; - if (is_guest_mode(&svm->vcpu) && - svm->nested.intercept & (1ULL << INTERCEPT_SMI)) { - /* TODO: Might need to set exit_info_1 and exit_info_2 here */ - svm->vmcb->control.exit_code = SVM_EXIT_SMI; - svm->nested.exit_required = true; - return 0; - } + return is_smm(vcpu); +} - return 1; +static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) + return -EBUSY; + + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) + return -EBUSY; + + return !svm_smi_blocked(vcpu); } static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -3793,12 +3849,13 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; nested_vmcb = map.hva; - enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map); + enter_svm_guest_mode(svm, vmcb, nested_vmcb); + kvm_vcpu_unmap(&svm->vcpu, &map, true); } return 0; } -static int enable_smi_window(struct kvm_vcpu *vcpu) +static void enable_smi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3806,9 +3863,9 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) if (vgif_enabled(svm)) set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ - return 1; + } else { + /* We must be in SMM; RSM will cause a vmexit anyway. */ } - return 0; } static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) @@ -3819,6 +3876,13 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) bool is_user = svm_get_cpl(vcpu) == 3; /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. 
* * Errata: @@ -3876,9 +3940,9 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) /* * TODO: Last condition latch INIT signals on vCPU when * vCPU is in guest-mode and vmcb12 defines intercept on INIT. - * To properly emulate the INIT intercept, SVM should implement - * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit() - * there if an INIT signal is pending. + * To properly emulate the INIT intercept, + * svm_check_nested_events() should call nested_svm_vmexit() + * if an INIT signal is pending. */ return !gif_set(svm) || (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); @@ -3932,8 +3996,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, - .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -3947,8 +4009,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .tlb_flush = svm_flush_tlb, + .tlb_flush_all = svm_flush_tlb, + .tlb_flush_current = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, + .tlb_flush_guest = svm_flush_tlb, .run = svm_vcpu_run, .handle_exit = handle_exit, @@ -3989,7 +4053,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_l1_tsc_offset = svm_write_l1_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, @@ -4002,6 +4065,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, + .nested_ops = &svm_nested_ops, + .deliver_posted_interrupt = svm_deliver_avic_intr, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, @@ -4016,14 +4081,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .apic_init_signal_blocked = svm_apic_init_signal_blocked, - - .check_nested_events = svm_check_nested_events, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index df3474f4fb02..6ac4c00a5d82 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -86,25 +86,17 @@ struct nested_state { u64 hsave_msr; u64 vm_cr_msr; u64 vmcb; + u32 host_intercept_exceptions; /* These are the merged vectors */ u32 *msrpm; - /* gpa pointers to the real vectors */ - u64 vmcb_msrpm; - u64 vmcb_iopm; + /* A VMRUN has started but has not yet been performed, so + * we cannot inject a nested vmexit yet. 
*/ + bool nested_run_pending; - /* A VMEXIT is required but not yet emulated */ - bool exit_required; - - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - - /* Nested Paging related state */ - u64 nested_cr3; + /* cache for control fields of the guest */ + struct vmcb_control_area ctl; }; struct vcpu_svm { @@ -360,8 +352,12 @@ u32 svm_msrpm_offset(u32 msr); void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); +void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); +bool svm_smi_blocked(struct kvm_vcpu *vcpu); +bool svm_nmi_blocked(struct kvm_vcpu *vcpu); +bool svm_interrupt_blocked(struct kvm_vcpu *vcpu); +void svm_set_gif(struct vcpu_svm *svm, bool value); /* nested.c */ @@ -369,28 +365,31 @@ void disable_nmi_singlestep(struct vcpu_svm *svm); #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -/* This function returns true if it is save to enable the nmi window */ -static inline bool nested_svm_nmi(struct vcpu_svm *svm) +static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(&svm->vcpu)) - return true; + struct vcpu_svm *svm = to_svm(vcpu); - if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) - return true; + return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK); +} - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->nested.exit_required = true; +static inline bool nested_exit_on_smi(struct vcpu_svm *svm) +{ + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI)); +} - return false; +static inline bool nested_exit_on_intr(struct vcpu_svm *svm) +{ + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR)); } -static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); } void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map); + struct vmcb *nested_vmcb); +void svm_leave_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); @@ -398,8 +397,10 @@ int nested_svm_exit_handled(struct vcpu_svm *svm); int nested_svm_check_permissions(struct vcpu_svm *svm); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -int svm_check_nested_events(struct kvm_vcpu *vcpu); int nested_svm_exit_special(struct vcpu_svm *svm); +void sync_nested_vmcb_control(struct vcpu_svm *svm); + +extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 249062f24b94..b66432b015d2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -225,6 +225,14 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 +#define kvm_print_exit_reason(exit_reason, isa) \ + (isa == KVM_ISA_VMX) ? \ + __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ + __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? 
" " : "", \ + (isa == KVM_ISA_VMX) ? \ + __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + /* * Tracepoint for kvm guest exit: */ @@ -250,12 +258,10 @@ TRACE_EVENT(kvm_exit, &__entry->info2); ), - TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx", + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", __entry->vcpu_id, - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), - __entry->guest_rip, __entry->info1, __entry->info2) + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), + __entry->guest_rip, __entry->info1, __entry->info2) ); /* @@ -588,12 +594,10 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info_err = exit_int_info_err; __entry->isa = isa; ), - TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " + TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), + kvm_print_exit_reason(__entry->exit_code, __entry->isa), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); @@ -626,13 +630,11 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->isa = isa; ), - TP_printk("reason: %s ext_inf1: 0x%016llx " + TP_printk("reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) + kvm_print_exit_reason(__entry->exit_code, __entry->isa), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) ); /* @@ -1539,6 +1541,57 @@ TRACE_EVENT(kvm_nested_vmenter_failed, __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); +/* + * Tracepoint for syndbg_set_msr. + */ +TRACE_EVENT(kvm_hv_syndbg_set_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); + +/* + * Tracepoint for syndbg_get_msr. 
+ */ +TRACE_EVENT(kvm_hv_syndbg_get_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8903475f751e..4bbd8b448d22 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -18,6 +18,8 @@ extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define PMU_CAP_FW_WRITES (1ULL << 13) + struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We @@ -367,4 +369,13 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline u64 vmx_get_perf_capabilities(void) +{ + /* + * Since counters are virtualized, KVM would support full + * width counting unconditionally, even if the host lacks it. + */ + return PMU_CAP_FW_WRITES; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 303813423c3e..e5325bd0f304 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -4,6 +4,7 @@ #include <linux/smp.h> #include "../hyperv.h" +#include "../cpuid.h" #include "evmcs.h" #include "vmcs.h" #include "vmx.h" @@ -160,14 +161,6 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), /* 32 bit rw */ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, @@ -334,17 +327,18 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * vmcs_version represents the range of supported Enlightened VMCS - * versions: lower 8 bits is the minimal version, higher 8 bits is the - * maximum supported version. KVM supports versions from 1 to - * KVM_EVMCS_VERSION. - */ - if (vmx->nested.enlightened_vmcs_enabled) - return (KVM_EVMCS_VERSION << 8) | 1; - - return 0; + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. 
+ */ + if (kvm_cpu_cap_get(X86_FEATURE_VMX) && + vmx->nested.enlightened_vmcs_enabled) + return (KVM_EVMCS_VERSION << 8) | 1; + + return 0; } void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e44f33c82332..9c74a732b08d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -303,11 +303,11 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, prev); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vmx_segment_cache_clear(vmx); + vmx_register_cache_reset(vcpu); } /* @@ -328,19 +328,19 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason; + u32 vm_exit_reason; unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { - exit_reason = EXIT_REASON_PML_FULL; + vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; } else if (fault->error_code & PFERR_RSVD_MASK) - exit_reason = EXIT_REASON_EPT_MISCONFIG; + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; else - exit_reason = EXIT_REASON_EPT_VIOLATION; + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); + nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -437,11 +437,6 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, } } -static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -698,11 +693,6 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) VM_EXIT_ACK_INTR_ON_EXIT; } -static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) -{ - return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); -} - static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -927,6 +917,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) } return 0; fail: + /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ return i + 1; } @@ -1074,34 +1065,81 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) } /* + * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. + * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't + * enable VPID for L2 (implying it expects a TLB flush on VMX transitions). + * Here's why. + * + * If EPT is enabled by L0 a sync is never needed: + * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there + * cannot be unsync'd SPTEs for either L1 or L2. + * + * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter + * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings + * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush + * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't + * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit. 
+ * + * If EPT is disabled by L0: + * - if VPID is enabled by L1 (for L2), the situation is similar to when L1 + * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't + * required to invalidate linear mappings (EPT is disabled so there are + * no combined or guest-physical mappings), i.e. L1 can't rely on the + * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1). + * + * - however if VPID is disabled by L1, then a sync is needed as L1 expects all + * linear mappings (EPT is disabled so there are no combined or guest-physical + * mappings) to be invalidated on both VM-Enter and VM-Exit. + * + * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which + * additionally checks that L2 has been assigned a VPID (when EPT is disabled). + * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect + * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2 + * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has + * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1 + * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't + * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush + * stale TLB entries, at which point L0 will sync L2's MMU. + */ +static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) +{ + return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu)); +} + +/* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - u32 *entry_failure_code) + enum vm_entry_failure_code *entry_failure_code) { - if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { - if (CC(!nested_cr3_valid(vcpu, cr3))) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return -EINVAL; - } + if (CC(!nested_cr3_valid(vcpu, cr3))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return -EINVAL; + } - /* - * If PAE paging and EPT are both on, CR3 is not used by the CPU and - * must not be dereferenced. - */ - if (is_pae_paging(vcpu) && !nested_ept) { - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { - *entry_failure_code = ENTRY_FAIL_PDPTE; - return -EINVAL; - } + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!nested_ept && is_pae_paging(vcpu) && + (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return -EINVAL; } } + /* + * Unconditionally skip the TLB flush on fast CR3 switch, all TLB + * flushes are handled by nested_vmx_transition_tlb_flush(). See + * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. 
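Condensing the rule spelled out above into a standalone sketch: a sync is needed only when L0 is shadow paging (EPT disabled in L0) and vmcs12 does not enable VPID. The helper name and the two plain booleans are illustrative stand-ins for enable_ept and nested_cpu_has_vpid(get_vmcs12(vcpu)).

    #include <stdbool.h>
    #include <stdio.h>

    /* Mirrors the decision made by nested_vmx_transition_mmu_sync(). */
    static bool transition_needs_mmu_sync(bool l0_ept, bool vmcs12_vpid)
    {
        return !l0_ept && !vmcs12_vpid;
    }

    int main(void)
    {
        /* L0 EPT, L1 VPID-for-L2  ->  sync needed? */
        printf("EPT=1 VPID=1 -> %d\n", transition_needs_mmu_sync(true, true));    /* 0 */
        printf("EPT=1 VPID=0 -> %d\n", transition_needs_mmu_sync(true, false));   /* 0 */
        printf("EPT=0 VPID=1 -> %d\n", transition_needs_mmu_sync(false, true));   /* 0 */
        printf("EPT=0 VPID=0 -> %d\n", transition_needs_mmu_sync(false, false));  /* 1 */
        return 0;
    }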
+ */ if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, false); + kvm_mmu_new_pgd(vcpu, cr3, true, + !nested_vmx_transition_mmu_sync(vcpu)); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@ -1132,11 +1170,48 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } -static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + bool is_vmenter) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; + /* + * If VPID is disabled, linear and combined mappings are flushed on + * VM-Enter/VM-Exit, and guest-physical mappings are valid only for + * their associated EPTP. + */ + if (!enable_vpid) + return; + + /* + * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings + * for *all* contexts to be flushed on VM-Enter/VM-Exit. + * + * If VPID is enabled and used by vmc12, but L2 does not have a unique + * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate + * a VPID for L2, flush the current context as the effective ASID is + * common to both L1 and L2. + * + * Defer the flush so that it runs after vmcs02.EPTP has been set by + * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid + * redundant flushes further down the nested pipeline. + * + * If a TLB flush isn't required due to any of the above, and vpid12 is + * changing then the new "virtual" VPID (vpid12) will reuse the same + * "real" VPID (vpid02), and so needs to be sync'd. There is no direct + * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for + * all nested vCPUs. + */ + if (!nested_cpu_has_vpid(vmcs12)) { + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } else if (!nested_has_guest_tlb_tag(vcpu)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } else if (is_vmenter && + vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + vpid_sync_context(nested_get_vpid02(vcpu)); + } } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) @@ -1700,10 +1775,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; - * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; - * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; - * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; - * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; * vmcs12->page_fault_error_code_mask = * evmcs->page_fault_error_code_mask; * vmcs12->page_fault_error_code_match = @@ -1777,10 +1848,6 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; - * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; - * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; - * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; - * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; * evmcs->tpr_threshold = vmcs12->tpr_threshold; * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; * evmcs->exception_bitmap = vmcs12->exception_bitmap; @@ -2020,9 +2087,25 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct 
hrtimer *timer) return HRTIMER_NORESTART; } -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + + if (!vmx->nested.has_preemption_timer_deadline) { + vmx->nested.preemption_timer_deadline = + vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; + vmx->nested.has_preemption_timer_deadline = true; + } + return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, + u64 preemption_timeout) { - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); /* @@ -2041,7 +2124,8 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); hrtimer_start(&vmx->nested.preemption_timer, - ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); + ktime_add_ns(ktime_get(), preemption_timeout), + HRTIMER_MODE_ABS_PINNED); } static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) @@ -2398,7 +2482,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) + enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; @@ -2447,32 +2531,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); - if (enable_vpid) { - /* - * There is no direct mapping between vpid02 and vpid12, the - * vpid02 is per-vCPU for L0 and reused while the value of - * vpid12 is changed w/ one invvpid during nested vmentry. - * The vpid12 is allocated by L1 for L2, so it will not - * influence global bitmap(for vpid01 and vpid02 allocation) - * even if spawn a lot of nested vCPUs. - */ - if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { - if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { - vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); - } - } else { - /* - * If L1 use EPT, then L0 needs to execute INVEPT on - * EPTP02 instead of EPTP01. Therefore, delay TLB - * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_MMU_PGD. Note that this assumes - * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest(). 
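A standalone model (not the kernel code) of the deadline bookkeeping added in vmx_calc_preemption_timer_value() above: the absolute deadline is latched once per nested VM-Enter in scaled L1-TSC units, and later calls only re-derive the remainder, which is what lets the deadline survive a save/restore of nested state. The struct, helper name, and the scaling shift of 5 are assumptions for the example; the real code shifts by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE and keeps the state in vmx->nested.

    #include <stdint.h>
    #include <stdio.h>

    #define TIMER_SCALE 5  /* assumed stand-in for VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE */

    struct timer_state {
        int has_deadline;
        uint64_t deadline;  /* absolute, in scaled L1-TSC units */
    };

    /* Latch the deadline on the first call, then only report what is left of it. */
    static uint64_t remaining_ticks(struct timer_state *s, uint64_t l1_tsc,
                                    uint32_t vmcs12_timer_value)
    {
        uint64_t scaled_now = l1_tsc >> TIMER_SCALE;

        if (!s->has_deadline) {
            s->deadline = scaled_now + vmcs12_timer_value;
            s->has_deadline = 1;
        }
        return s->deadline - scaled_now;
    }

    int main(void)
    {
        struct timer_state s = { 0 };
        uint64_t tsc = 1ull << 20;

        /* First VM-Enter: the deadline is latched, all 1000 ticks remain. */
        printf("%llu\n", (unsigned long long)remaining_ticks(&s, tsc, 1000));
        /* 512 scaled ticks later (e.g. after restoring nested state): 488 remain. */
        printf("%llu\n", (unsigned long long)remaining_ticks(&s, tsc + (512ull << TIMER_SCALE), 1000));
        return 0;
    }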
- */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - } + nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); @@ -2883,11 +2942,11 @@ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *exit_qual) + enum vm_entry_failure_code *entry_failure_code) { bool ia32e; - *exit_qual = ENTRY_FAIL_DEFAULT; + *entry_failure_code = ENTRY_FAIL_DEFAULT; if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) @@ -2902,7 +2961,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { - *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; + *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; return -EINVAL; } @@ -3194,9 +3253,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + enum vm_entry_failure_code entry_failure_code; bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; - u32 exit_qual; + u32 exit_reason, failed_index; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); @@ -3241,24 +3303,33 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_VMFAIL; } - if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, + &entry_failure_code)) { + exit_reason = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; + } } enter_guest_mode(vcpu); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { + exit_reason = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; + } if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) + failed_index = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (failed_index) { + exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; + } } else { /* * The MMU is not initialized to point at the right entities yet and @@ -3293,8 +3364,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * the timer. */ vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) { + u64 timer_value = vmx_calc_preemption_timer_value(vcpu); + vmx_start_preemption_timer(vcpu, timer_value); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. 
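The flush policy applied by nested_vmx_transition_tlb_flush(), which now runs on both nested VM-Enter (from prepare_vmcs02() above) and nested VM-Exit, condensed into a standalone sketch. The enum and booleans are illustrative stand-ins for the KVM requests and for enable_vpid, nested_cpu_has_vpid(vmcs12) and nested_has_guest_tlb_tag().

    #include <stdbool.h>
    #include <stdio.h>

    enum tlb_action {
        TLB_NONE,               /* nothing to do */
        TLB_FLUSH_ALL_CONTEXTS, /* KVM_REQ_TLB_FLUSH */
        TLB_FLUSH_CURRENT,      /* KVM_REQ_TLB_FLUSH_CURRENT */
        TLB_SYNC_VPID02,        /* vpid_sync_context(vpid02) */
    };

    static enum tlb_action nested_transition_tlb_action(bool host_vpid,
                                                        bool vmcs12_vpid,
                                                        bool l2_has_own_tlb_tag,
                                                        bool is_vmenter,
                                                        bool vpid12_changed)
    {
        if (!host_vpid)
            return TLB_NONE;                /* every VM-Enter/VM-Exit already flushes */
        if (!vmcs12_vpid)
            return TLB_FLUSH_ALL_CONTEXTS;  /* L1 expects all contexts flushed */
        if (!l2_has_own_tlb_tag)
            return TLB_FLUSH_CURRENT;       /* L1 and L2 share the effective ASID */
        if (is_vmenter && vpid12_changed)
            return TLB_SYNC_VPID02;         /* new vpid12 reuses the same vpid02 */
        return TLB_NONE;
    }

    int main(void)
    {
        /* L1 runs L2 without VPID: flush everything on the transition. */
        printf("%d\n", nested_transition_tlb_action(true, false, false, true, false));  /* 1 */
        /* vpid12 changed on VM-Enter while L2 has its own tag: sync vpid02. */
        printf("%d\n", nested_transition_tlb_action(true, true, true, true, true));     /* 3 */
        return 0;
    }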
At this point @@ -3322,7 +3395,6 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; @@ -3403,6 +3475,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; + vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; @@ -3632,6 +3705,12 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) vcpu->arch.exception.payload); } +static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + to_vmx(vcpu)->nested.preemption_timer_expired; +} + static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3661,11 +3740,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) /* * Process any exceptions that are not debug traps before MTF. */ - if (vcpu->arch.exception.pending && - !vmx_pending_dbg_trap(vcpu) && - nested_vmx_check_exception(vcpu, &exit_qual)) { + if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_vmx_check_exception(vcpu, &exit_qual)) + goto no_vmexit; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); return 0; } @@ -3678,25 +3757,34 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending && - nested_vmx_check_exception(vcpu, &exit_qual)) { + if (vcpu->arch.exception.pending) { if (block_nested_events) return -EBUSY; + if (!nested_vmx_check_exception(vcpu, &exit_qual)) + goto no_vmexit; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); return 0; } - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { + if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0; } - if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + if (block_nested_events) + return -EBUSY; + goto no_vmexit; + } + + if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_exit_on_nmi(vcpu)) + goto no_vmexit; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK, 0); @@ -3709,13 +3797,16 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) { + if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_exit_on_intr(vcpu)) + goto no_vmexit; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0; } +no_vmexit: vmx_complete_nested_posted_interrupt(vcpu); return 0; } @@ -3842,12 +3933,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load(&vmx->vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load(&vmx->vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); put_cpu(); } @@ -3876,10 
+3967,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -3889,9 +3976,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; if (nested_cpu_has_preemption_timer(vmcs12) && - vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && + !vmx->nested.nested_run_pending) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3939,11 +4027,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * which already writes to vmcs12 directly. */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, + u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { /* update exit information fields: */ - vmcs12->vm_exit_reason = exit_reason; + vmcs12->vm_exit_reason = vm_exit_reason; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -3998,8 +4086,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + enum vm_entry_failure_code ignored; struct kvm_segment seg; - u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -4034,30 +4122,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * Only PDPTE load can fail as the value of cr3 was checked on entry and * couldn't have changed. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - /* - * If vmcs01 doesn't use VPID, CPU flushes TLB on every - * VMEntry/VMExit. Thus, no need to flush TLB. - * - * If vmcs12 doesn't use VPID, L1 expects TLB to be - * flushed on every VMEntry/VMExit. - * - * Otherwise, we can preserve TLB entries as long as we are - * able to tag L1 TLB entries differently than L2 TLB entries. - * - * If vmcs12 uses EPT, we need to execute this flush on EPTP01 - * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_MMU_PGD. - */ - if (enable_vpid && - (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } + nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); @@ -4204,7 +4275,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * VMFail, like everything else we just need to ensure our * software model is up-to-date. 
*/ - if (enable_ept) + if (enable_ept && is_pae_paging(vcpu)) ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); @@ -4272,7 +4343,7 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4281,6 +4352,10 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Service the TLB flush request for L2 before switching to L1. */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) @@ -4292,9 +4367,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); - if (exit_reason != -1) - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); + if (vm_exit_reason != -1) + prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, + exit_intr_info, exit_qualification); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -4344,20 +4419,20 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; - /* - * We are now running in L2, mmu_notifier will force to reload the - * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. - */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + if (vmx->nested.reload_vmcs01_apic_access_page) { + vmx->nested.reload_vmcs01_apic_access_page = false; + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + } - if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + if ((vm_exit_reason != -1) && + (enable_shadow_vmcs || vmx->nested.hv_evmcs)) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); @@ -4365,7 +4440,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - if (exit_reason != -1) + if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, vmcs12->idt_vectoring_info_field, @@ -4554,13 +4629,13 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) gva_t gva; struct x86_exception e; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, sizeof(*vmpointer), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } @@ -4614,7 +4689,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_ABS_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.vpid02 = allocate_vpid(); @@ -4819,7 +4894,7 @@ 
static int handle_vmread(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@ -4869,7 +4944,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } } @@ -4905,7 +4980,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@ -4943,7 +5018,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) instr_info, false, len, &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } } @@ -5090,7 +5165,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qual = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; @@ -5108,23 +5183,33 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, sizeof(gpa_t), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } return nested_vmx_succeed(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; - unsigned long type; + unsigned long type, roots_to_free; + struct kvm_mmu *mmu; gva_t gva; struct x86_exception e; struct { u64 eptp, gpa; } operand; + int i; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5148,27 +5233,49 @@ static int handle_invept(struct kvm_vcpu *vcpu) /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } - switch (type) { - case VMX_EPT_EXTENT_GLOBAL: - case VMX_EPT_EXTENT_CONTEXT: /* - * TODO: Sync the necessary shadow EPT roots here, rather than - * at the next emulated VM-entry. + * Nested EPT roots are always held through guest_mmu, + * not root_mmu. 
*/ + mmu = &vcpu->arch.guest_mmu; + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if (!nested_vmx_check_eptp(vcpu, operand.eptp)) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + roots_to_free = 0; + if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_CURRENT; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (nested_ept_root_matches(mmu->prev_roots[i].hpa, + mmu->prev_roots[i].pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + break; + case VMX_EPT_EXTENT_GLOBAL: + roots_to_free = KVM_MMU_ROOTS_ALL; break; default: BUG(); break; } + if (roots_to_free) + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); + return nested_vmx_succeed(vcpu); } @@ -5208,11 +5315,11 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } if (operand.vpid >> 16) @@ -5226,27 +5333,37 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) is_noncanonical_address(operand.gla, vcpu)) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vpid02, operand.gla); - } else - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_context(vpid02); break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_context(vpid02); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } + /* + * Sync the shadow page tables if EPT is disabled, L1 is invalidating + * linear mappings for L2 (tagged with L2's VPID). Free all roots as + * VPIDs are not tracked in the MMU role. + * + * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share + * an MMU when EPT is disabled. + * + * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
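A standalone sketch of the EPTP comparison used above when emulating single-context INVEPT: two EPT pointers name the same root when their physical-address fields (bits 51:12) match, so the memory-type and page-walk-length bits are masked off before comparing; the kernel additionally requires the cached root HPA to be valid. The constant expansion and example values below are illustrative only.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Expansion of GENMASK_ULL(51, 12), the physical-address field of an EPTP. */
    #define EPTP_PA_MASK (((1ull << 52) - 1) & ~0xfffull)

    static bool ept_root_matches(uint64_t root_eptp, uint64_t eptp)
    {
        return (root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK);
    }

    int main(void)
    {
        uint64_t root = 0x123456000ull | 0x5eull;  /* same PA, different low attribute bits */
        uint64_t inv  = 0x123456000ull | 0x1eull;

        printf("%d\n", ept_root_matches(root, inv));             /* 1: free this root */
        printf("%d\n", ept_root_matches(root, 0x789000000ull));  /* 0: keep it */
        return 0;
    }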
+ */ + if (!enable_ept) + kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, + KVM_MMU_ROOTS_ALL); + return nested_vmx_succeed(vcpu); } @@ -5327,8 +5444,8 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) fail: nested_vmx_vmexit(vcpu, vmx->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmx_get_intr_info(vcpu), + vmx_get_exit_qual(vcpu)); return 1; } @@ -5379,7 +5496,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -5433,7 +5550,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsigned long val; @@ -5449,15 +5566,6 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return true; break; case 3: - if ((vmcs12->cr3_target_count >= 1 && - vmcs12->cr3_target_value0 == val) || - (vmcs12->cr3_target_count >= 2 && - vmcs12->cr3_target_value1 == val) || - (vmcs12->cr3_target_count >= 3 && - vmcs12->cr3_target_value2 == val) || - (vmcs12->cr3_target_count >= 4 && - vmcs12->cr3_target_value3 == val)) - return false; if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) return true; break; @@ -5551,49 +5659,85 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) } /* - * Return true if we should exit from L2 to L1 to handle an exit, or false if we - * should handle it ourselves in L0 (and then continue L2). Only call this - * when in is_guest_mode (L2). + * Return true if L0 wants to handle an exit from L2 regardless of whether or not + * L1 wants the exit. Only call this when in is_guest_mode (L2). */ -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) { - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - WARN_ON_ONCE(vmx->nested.nested_run_pending); - - if (unlikely(vmx->fail)) { - trace_kvm_nested_vmenter_failed( - "hardware VM-instruction error: ", - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + u32 intr_info; switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) - return false; + return true; else if (is_page_fault(intr_info)) - return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; + return vcpu->arch.apf.host_apf_flags || !enable_ept; else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; + return true; else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; + return true; + return false; + case EXIT_REASON_EXTERNAL_INTERRUPT: + return true; + case EXIT_REASON_MCE_DURING_VMENTRY: + return true; + case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. 
If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return true; + case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ + return true; + case EXIT_REASON_PREEMPTION_TIMER: + return true; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return true; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return true; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return true; + default: + break; + } + return false; +} + +/* + * Return 1 if L1 wants to intercept an exit from L2. Only call this when in + * is_guest_mode (L2). + */ +static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 intr_info; + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); + if (is_nmi(intr_info)) + return true; + else if (is_page_fault(intr_info)) + return true; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: - return false; + return nested_exit_on_intr(vcpu); case EXIT_REASON_TRIPLE_FAULT: return true; case EXIT_REASON_INTERRUPT_WINDOW: @@ -5658,7 +5802,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: - return false; + return true; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: @@ -5670,22 +5814,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) * delivery" only come from vmcs12. */ return true; - case EXIT_REASON_EPT_VIOLATION: - /* - * L0 always deals with the EPT violation. If nested EPT is - * used, and the nested mmu code discovers that the address is - * missing in the guest EPT table (EPT12), the EPT violation - * will be injected with nested_ept_inject_page_fault() - */ - return false; - case EXIT_REASON_EPT_MISCONFIG: - /* - * L2 never uses directly L1's EPT, but rather L0's own EPT - * table (shadow on EPT) or a merged EPT table that L0 built - * (EPT on EPT). So any problems with the structure of the - * table is L0's fault. - */ - return false; case EXIT_REASON_INVPCID: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && @@ -5702,17 +5830,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; - case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ - return false; - case EXIT_REASON_VMFUNC: - /* VM functions are emulated through L2->L0 vmexits. */ - return false; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return false; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -5722,6 +5839,67 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) } } +/* + * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was + * reflected into L1. 
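The overall flow of nested_vmx_reflect_vmexit(), defined just below, reduced to a standalone sketch: a late VM-Entry failure always takes the nested VM-Exit path, L0's needs are checked first and always win, and only exits that L1 asked to intercept are reflected. The booleans stand in for vmx->fail and the two _wants_exit() predicates above.

    #include <stdbool.h>
    #include <stdio.h>

    static bool reflect_exit_to_l1(bool late_vmentry_fail,
                                   bool l0_wants_exit, bool l1_wants_exit)
    {
        if (late_vmentry_fail)
            return true;   /* shares the nested VM-Exit path */
        if (l0_wants_exit)
            return false;  /* handled entirely in L0 */
        if (!l1_wants_exit)
            return false;  /* L1 didn't ask for it, handle in L0 */
        return true;       /* synthesize a VM-Exit from L2 to L1 */
    }

    int main(void)
    {
        /* e.g. an EPT violation: L0 always wants it, never reflected. */
        printf("%d\n", reflect_exit_to_l1(false, true, true));   /* 0 */
        /* e.g. CPUID, which L1 intercepts: reflected to L1. */
        printf("%d\n", reflect_exit_to_l1(false, false, true));  /* 1 */
        return 0;
    }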
+ */ +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; + unsigned long exit_qual; + u32 exit_intr_info; + + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + /* + * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM + * has already loaded L2's state. + */ + if (unlikely(vmx->fail)) { + trace_kvm_nested_vmenter_failed( + "hardware VM-instruction error: ", + vmcs_read32(VM_INSTRUCTION_ERROR)); + exit_intr_info = 0; + exit_qual = 0; + goto reflect_vmexit; + } + + exit_intr_info = vmx_get_intr_info(vcpu); + exit_qual = vmx_get_exit_qual(vcpu); + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, + vmx->idt_vectoring_info, exit_intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + + /* If L0 (KVM) wants the exit, it trumps L1's desires. */ + if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) + return false; + + /* If L1 doesn't want the exit, handle it in L0. */ + if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) + return false; + + /* + * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For + * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would + * need to be synthesized by querying the in-kernel LAPIC, but external + * interrupts are never reflected to L1 so it's a non-issue. + */ + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + +reflect_vmexit: + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); + return true; +} static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, @@ -5733,8 +5911,10 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .flags = 0, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), + .hdr.vmx.flags = 0, .hdr.vmx.vmxon_pa = -1ull, .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; @@ -5776,6 +5956,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx->nested.mtf_pending) kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; + + if (nested_cpu_has_preemption_timer(vmcs12) && + vmx->nested.has_preemption_timer_deadline) { + kvm_state.hdr.vmx.flags |= + KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; + kvm_state.hdr.vmx.preemption_timer_deadline = + vmx->nested.preemption_timer_deadline; + } } } @@ -5821,7 +6009,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } - out: return kvm_state.size; } @@ -5844,7 +6031,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; - u32 exit_qual; + enum vm_entry_failure_code ignored; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; int ret; @@ -5983,9 +6170,15 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; } + if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { + vmx->nested.has_preemption_timer_deadline = true; + vmx->nested.preemption_timer_deadline = + kvm_state->hdr.vmx.preemption_timer_deadline; + } + if (nested_vmx_check_controls(vcpu, vmcs12) || nested_vmx_check_host_state(vcpu, vmcs12) || - 
nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; @@ -6031,7 +6224,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_reflected() will not pass related exits to L1. + * nested_vmx_l1_wants_exit() will not pass related exits to L1. * These rules have exceptions below. */ @@ -6259,8 +6452,7 @@ void nested_vmx_hardware_unsetup(void) } } -__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, - int (*exit_handlers[])(struct kvm_vcpu *)) +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; @@ -6296,12 +6488,15 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - ops->check_nested_events = vmx_check_nested_events; - ops->get_nested_state = vmx_get_nested_state; - ops->set_nested_state = vmx_set_nested_state; - ops->get_vmcs12_pages = nested_get_vmcs12_pages; - ops->nested_enable_evmcs = nested_enable_evmcs; - ops->nested_get_evmcs_version = nested_get_evmcs_version; - return 0; } + +struct kvm_x86_nested_ops vmx_nested_ops = { + .check_events = vmx_check_nested_events, + .hv_timer_pending = nested_vmx_preemption_timer_pending, + .get_state = vmx_get_nested_state, + .set_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .enable_evmcs = nested_enable_evmcs, + .get_evmcs_version = nested_get_evmcs_version, +}; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index ac56aefa49e3..758bccc26cf9 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -19,14 +19,13 @@ enum nvmx_vmentry_status { void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps); void nested_vmx_hardware_unsetup(void); -__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, - int (*exit_handlers[])(struct kvm_vcpu *)); +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification); void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); @@ -62,6 +61,13 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs; } +static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; +} + static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ @@ -74,34 +80,6 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) } /* - * Reflect a VM Exit into L1. 
- */ -static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, - u32 exit_reason) -{ - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* - * At this point, the exit interruption info in exit_intr_info - * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT - * we need to query the in-kernel LAPIC. - */ - WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - } - - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, - vmcs_readl(EXIT_QUALIFICATION)); - return 1; -} - -/* * Return the cr0 value that a nested guest would read. This is a combination * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by * its hypervisor (cr0_read_shadow). @@ -246,6 +224,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; } +static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); +} + /* * In nested virtualization, check if L1 asked to exit on external interrupts. * For most existing hypervisors, this will always return true. @@ -299,4 +282,6 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid +extern struct kvm_x86_nested_ops vmx_nested_ops; + #endif /* __KVM_X86_VMX_NESTED_H */ diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 19717d0a1100..5f1ac002b4b6 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -268,42 +268,38 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); } -static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) -{ - if (vpid == 0) - return true; - - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); - return true; - } - - return false; -} - static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) return; - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) { - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) vpid_sync_vcpu_single(vpid); - else + else if (vpid != 0) vpid_sync_vcpu_global(); } +static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return; + + if (cpu_has_vmx_invvpid_individual_addr()) + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + else + vpid_sync_context(vpid); +} + static inline void ept_sync_global(void) { __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7c857737b438..d33d890b605f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -18,6 +18,8 @@ #include "nested.h" #include "pmu.h" +#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) + static struct kvm_event_hw_type_mapping intel_arch_events[] = { /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, 
PERF_COUNT_HW_CPU_CYCLES }, @@ -150,6 +152,22 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return false; + + return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES; +} + +static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) +{ + if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) + return NULL; + + return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -162,10 +180,13 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; + case MSR_IA32_PERF_CAPABILITIES: + ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr); + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr); break; } @@ -184,35 +205,45 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } -static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; + u32 msr = msr_info->index; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; + msr_info->data = pmu->fixed_ctr_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; + msr_info->data = pmu->global_status; return 0; case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; + msr_info->data = pmu->global_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; + msr_info->data = pmu->global_ovf_ctrl; + return 0; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); - *data = val & pmu->counter_bitmask[KVM_PMC_GP]; + msr_info->data = + val & pmu->counter_bitmask[KVM_PMC_GP]; return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { u64 val = pmc_read_counter(pmc); - *data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; + msr_info->data = + val & pmu->counter_bitmask[KVM_PMC_FIXED]; return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; + msr_info->data = pmc->eventsel; return 0; } } @@ -258,9 +289,22 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ? 
+ (data & ~vmx_get_perf_capabilities()) : data) + return 1; + vcpu->arch.perf_capabilities = data; + return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { - if (!msr_info->host_initiated) + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { + if ((msr & MSR_PMC_FULL_WIDTH_BIT) && + (data & ~pmu->counter_bitmask[KVM_PMC_GP])) + return 1; + if (!msr_info->host_initiated && + !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); if (pmc->perf_event) @@ -300,6 +344,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; + vcpu->arch.perf_capabilities = 0; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -312,6 +357,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; perf_get_x86_pmu_capability(&x86_pmu); + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 481ad879197b..5c0ff80b85c0 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -19,7 +19,7 @@ struct vmcs_hdr { struct vmcs { struct vmcs_hdr hdr; u32 abort; - char data[0]; + char data[]; }; DECLARE_PER_CPU(struct vmcs *, current_vmcs); diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 53dfb401316d..c8e51c004f78 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -115,10 +115,6 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), FIELD(CR0_READ_SHADOW, cr0_read_shadow), FIELD(CR4_READ_SHADOW, cr4_read_shadow), - FIELD(CR3_TARGET_VALUE0, cr3_target_value0), - FIELD(CR3_TARGET_VALUE1, cr3_target_value1), - FIELD(CR3_TARGET_VALUE2, cr3_target_value2), - FIELD(CR3_TARGET_VALUE3, cr3_target_value3), FIELD(EXIT_QUALIFICATION, exit_qualification), FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), FIELD(GUEST_CR0, guest_cr0), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index d0c6df373f67..80232daf00ff 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -80,10 +80,7 @@ struct __packed vmcs12 { natural_width cr4_guest_host_mask; natural_width cr0_read_shadow; natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; + natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. 
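A standalone model of the guest-visible difference introduced by the pmu_intel.c changes above: once PMU_CAP_FW_WRITES (bit 13 of MSR_IA32_PERF_CAPABILITIES, which requires PDCM) is exposed, the guest can write its general-purpose counters through the MSR_IA32_PMC0 aliases instead of the legacy MSR_IA32_PERFCTR0 aliases, which sign-extend the written value from bit 31. The 48-bit counter width and the helper below are assumptions for the example, not the kernel code.

    #include <stdint.h>
    #include <stdio.h>

    #define COUNTER_MASK ((1ull << 48) - 1)  /* example counter width only */

    static int gp_counter_write(int full_width, uint64_t data, uint64_t *counter)
    {
        if (full_width) {
            if (data & ~COUNTER_MASK)
                return 1;  /* rejected: value wider than the counter */
        } else {
            data = (uint64_t)(int64_t)(int32_t)data;  /* legacy sign-extension */
        }
        *counter = data & COUNTER_MASK;
        return 0;
    }

    int main(void)
    {
        uint64_t c;

        gp_counter_write(0, 0x80000000ull, &c);  /* legacy alias: sign-extended */
        printf("legacy     -> %#llx\n", (unsigned long long)c);  /* 0xffff80000000 */
        gp_counter_write(1, 0x80000000ull, &c);  /* full-width alias: taken literally */
        printf("full width -> %#llx\n", (unsigned long long)c);  /* 0x80000000 */
        return 0;
    }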
*/ natural_width exit_qualification; natural_width guest_linear_address; natural_width guest_cr0; @@ -263,10 +260,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); CHECK_OFFSET(cr4_read_shadow, 368); - CHECK_OFFSET(cr3_target_value0, 376); - CHECK_OFFSET(cr3_target_value1, 384); - CHECK_OFFSET(cr3_target_value2, 392); - CHECK_OFFSET(cr3_target_value3, 400); + CHECK_OFFSET(dead_space, 376); CHECK_OFFSET(exit_qualification, 408); CHECK_OFFSET(guest_linear_address, 416); CHECK_OFFSET(guest_cr0, 424); diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 51d1a82742fd..e0a182cb3cdd 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -166,13 +166,13 @@ SYM_FUNC_START(__vmx_vcpu_run) mov WORD_SIZE(%_ASM_SP), %_ASM_AX /* Save all guest registers, including RAX from the stack */ - __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) - mov %_ASM_CX, VCPU_RCX(%_ASM_AX) - mov %_ASM_DX, VCPU_RDX(%_ASM_AX) - mov %_ASM_BX, VCPU_RBX(%_ASM_AX) - mov %_ASM_BP, VCPU_RBP(%_ASM_AX) - mov %_ASM_SI, VCPU_RSI(%_ASM_AX) - mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + pop VCPU_RAX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) #ifdef CONFIG_X86_64 mov %r8, VCPU_R8 (%_ASM_AX) mov %r9, VCPU_R9 (%_ASM_AX) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 89c766fad889..170cc76a581f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -437,6 +437,11 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + static unsigned long host_idt_base; /* @@ -1306,10 +1311,12 @@ after_clear_sn: pi_set_on(pi_desc); } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + struct loaded_vmcs *buddy) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; + struct vmcs *prev; if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); @@ -1328,16 +1335,28 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) local_irq_enable(); } - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + prev = per_cpu(current_vmcs, cpu); + if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - indirect_branch_prediction_barrier(); + + /* + * No indirect branch prediction barrier needed when switching + * the active VMCS within a guest, e.g. on nested VM-Enter. + * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + */ + if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + indirect_branch_prediction_barrier(); } if (!already_loaded) { void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. + */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* @@ -1364,11 +1383,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. 
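The offset bookkeeping behind the dead_space[4] placeholder above, as a compilable sketch: keeping four natural_width slots where cr3_target_value0..3 used to live leaves every later vmcs12 field at its old offset, which is what the CHECK_OFFSET() lines verify. natural_width is modeled as a u64 here, matching the 8-byte stride of the checked offsets, and the struct below is a made-up fragment, not the real (packed) vmcs12.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t natural_width;

    struct vmcs12_tail_sketch {
        char pad[376];                /* everything up to and including cr4_read_shadow */
        natural_width dead_space[4];  /* was cr3_target_value0..3 at 376/384/392/400 */
        natural_width exit_qualification;
        natural_width guest_linear_address;
    };

    _Static_assert(offsetof(struct vmcs12_tail_sketch, dead_space) == 376,
                   "dead_space must stay at offset 376");
    _Static_assert(offsetof(struct vmcs12_tail_sketch, exit_qualification) == 408,
                   "exit_qualification must stay at offset 408");
    _Static_assert(offsetof(struct vmcs12_tail_sketch, guest_linear_address) == 416,
                   "guest_linear_address must stay at offset 416");

    int main(void) { return 0; }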
*/ -void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); @@ -1546,7 +1565,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rip; + unsigned long rip, orig_rip; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1558,8 +1577,17 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + orig_rip = kvm_rip_read(vcpu); + rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); +#ifdef CONFIG_X86_64 + /* + * We need to mask out the high 32 bits of RIP if not in 64-bit + * mode, but just finding out that we are in 64-bit mode is + * quite expensive. Only do it if there was a carry. + */ + if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) + rip = (u32)rip; +#endif kvm_rip_write(vcpu, rip); } else { if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) @@ -1712,17 +1740,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - return vcpu->arch.tsc_offset - vmcs12->tsc_offset; - - return vcpu->arch.tsc_offset; -} - static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -1771,6 +1788,9 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + case MSR_IA32_PERF_CAPABILITIES: + msr->data = vmx_get_perf_capabilities(); + return 0; default: return 1; } @@ -1926,6 +1946,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } +static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, + u64 data) +{ +#ifdef CONFIG_X86_64 + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return (u32)data; +#endif + return (unsigned long)data; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. 
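A standalone illustration of the carry trick used above in skip_emulated_instruction(): ((rip ^ orig_rip) >> 31) == 3 flags an addition that carried out of bit 31 into bit 32, i.e. the one case in which a non-64-bit guest's RIP would have wrapped, so the comparatively expensive is_64_bit_mode() check only runs then, and only outside 64-bit mode is the result truncated. The function below is a simplified stand-in, not the kernel code.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t advance_rip(uint64_t orig_rip, uint32_t insn_len, int is_64_bit_mode)
    {
        uint64_t rip = orig_rip + insn_len;

        /* Rarely true: the add carried from bit 31 into bit 32. */
        if (((rip ^ orig_rip) >> 31) == 3 && !is_64_bit_mode)
            rip = (uint32_t)rip;
        return rip;
    }

    int main(void)
    {
        /* 32-bit guest just below 4GiB: the wrap is detected and masked. */
        printf("%#llx\n", (unsigned long long)advance_rip(0xfffffffeull, 5, 0));  /* 0x3 */
        /* 64-bit guest: same carry, but RIP legitimately crosses 4GiB. */
        printf("%#llx\n", (unsigned long long)advance_rip(0xfffffffeull, 5, 1));  /* 0x100000003 */
        return 0;
    }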
@@ -1963,13 +1993,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_eip = data; + } vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_esp = data; + } vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: @@ -2187,6 +2221,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + unsigned long guest_owned_bits; + kvm_register_mark_available(vcpu, reg); switch (reg) { @@ -2200,10 +2236,22 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_ept) ept_save_pdptrs(vcpu); break; + case VCPU_EXREG_CR0: + guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; + break; case VCPU_EXREG_CR3: if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; + case VCPU_EXREG_CR4: + guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; + break; default: WARN_ON_ONCE(1); break; @@ -2837,34 +2885,64 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { - int vpid = to_vmx(vcpu)->vpid; - - if (!vpid_sync_vcpu_addr(vpid, addr)) - vpid_sync_context(vpid); + struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * If VPIDs are not supported or enabled, then the above is a no-op. - * But we don't really need a TLB flush in that case anyway, because - * each VM entry/exit includes an implicit flush when VPID is 0. + * INVEPT must be issued when EPT is enabled, irrespective of VPID, as + * the CPU is not required to invalidate guest-physical mappings on + * VM-Entry, even if VPID is disabled. Guest-physical mappings are + * associated with the root EPT structure and not any particular VPID + * (INVVPID also isn't required to invalidate guest-physical mappings). */ + if (enable_ept) { + ept_sync_global(); + } else if (enable_vpid) { + if (cpu_has_vmx_invvpid_global()) { + vpid_sync_vcpu_global(); + } else { + vpid_sync_vcpu_single(vmx->vpid); + vpid_sync_vcpu_single(vmx->nested.vpid02); + } + } } -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { - ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + u64 root_hpa = vcpu->arch.mmu->root_hpa; - vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; + /* No flush required if the current context is invalid. 
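The new VCPU_EXREG_CR0/CR4 cases in vmx_cache_reg() above refresh only the guest-owned bits of the cached register from the live VMCS value. A standalone sketch of that masking pattern (toy code with a hypothetical mask, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* Refresh only the guest-owned bits of a cached control register from the
 * value the hardware currently holds; host-owned bits stay untouched. */
static uint64_t refresh_guest_owned(uint64_t cached, uint64_t hw,
				    uint64_t guest_owned_bits)
{
	cached &= ~guest_owned_bits;
	cached |= hw & guest_owned_bits;
	return cached;
}

int main(void)
{
	uint64_t guest_owned = 0x0000000f;	/* hypothetical mask */
	uint64_t cached = 0x800000f1;
	uint64_t hw     = 0x0000000a;

	printf("0x%llx\n", (unsigned long long)
	       refresh_guest_owned(cached, hw, guest_owned));	/* 0x800000fa */
	return 0;
}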
*/ + if (!VALID_PAGE(root_hpa)) + return; + + if (enable_ept) + ept_sync_context(construct_eptp(vcpu, root_hpa)); + else if (!is_guest_mode(vcpu)) + vpid_sync_context(to_vmx(vcpu)->vpid); + else + vpid_sync_context(nested_get_vpid02(vcpu)); } -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { - ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + /* + * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vmx_flush_tlb_guest() for an explanation of why this is ok. + */ + vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); +} - vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; +static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + /* + * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 + * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit + * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), + * i.e. no explicit INVVPID is necessary. + */ + vpid_sync_context(to_vmx(vcpu)->vpid); } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -2886,12 +2964,13 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (is_pae_paging(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - } + if (WARN_ON_ONCE(!is_pae_paging(vcpu))) + return; + + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } @@ -2955,20 +3034,27 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); } -static int get_ept_level(struct kvm_vcpu *vcpu) +static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; } +static int get_ept_level(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) + return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); + + return vmx_get_tdp_level(vcpu); +} + u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp = VMX_EPTP_MT_WB; @@ -2983,16 +3069,15 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; - guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(vcpu, cr3); + eptp = construct_eptp(vcpu, pgd); vmcs_write64(EPT_POINTER, eptp); if (kvm_x86_ops.tlb_remote_flush) { @@ -3003,16 +3088,15 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); } - /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. 
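For context on construct_eptp() above, an EPT pointer packs the memory type in bits 2:0 and the page-walk length minus one in bits 5:3, with the root physical address in the upper bits (per the Intel SDM layout). A rough standalone sketch; the constants and names here are illustrative, not KVM's:

#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB	6ULL	/* write-back memory type */
#define EPTP_PWL_SHIFT	3	/* page-walk length field */

static uint64_t make_eptp(uint64_t root_hpa, int levels)
{
	return EPTP_MT_WB |
	       ((uint64_t)(levels - 1) << EPTP_PWL_SHIFT) |
	       (root_hpa & ~0xfffULL);
}

int main(void)
{
	/* 4-level walk rooted at a hypothetical page: 0x12345601e */
	printf("0x%llx\n", (unsigned long long)make_eptp(0x123456000ULL, 4));
	return 0;
}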
*/ - if (is_guest_mode(vcpu)) - update_guest_cr3 = false; - else if (!enable_unrestricted_guest && !is_paging(vcpu)) + if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) guest_cr3 = vcpu->arch.cr3; else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; ept_load_pdptrs(vcpu); + } else { + guest_cr3 = pgd; } if (update_guest_cr3) @@ -3063,6 +3147,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; vcpu->arch.cr4 = cr4; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); if (!enable_unrestricted_guest) { if (enable_ept) { @@ -3851,7 +3936,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (vcpu != kvm_get_running_vcpu() && + !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; @@ -4147,8 +4233,7 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE, 0); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); } #define VMX_XSS_EXIT_BITMAP 0 @@ -4453,31 +4538,54 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) +bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; + if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return false; - if (!enable_vnmi && - to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) - return 0; + if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) + return true; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); + return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_NMI)); } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) - return false; + return -EBUSY; + + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return -EBUSY; + + return !vmx_nmi_blocked(vcpu); +} +bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) +{ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - return true; + return false; - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); + return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (to_vmx(vcpu)->nested.nested_run_pending) + return -EBUSY; + + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. 
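The reworked vmx_nmi_allowed()/vmx_interrupt_allowed() above adopt a three-way convention: a negative return (-EBUSY) means injection cannot even be attempted right now and the caller should request an immediate exit, 0 means the event is architecturally blocked, and a positive value means injection is allowed. A toy model of that contract (invented types, not the kernel code):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_vcpu { bool nested_run_pending; bool nmi_blocked; };

static int toy_nmi_allowed(const struct toy_vcpu *v)
{
	if (v->nested_run_pending)
		return -EBUSY;		/* cannot attempt injection now */
	return !v->nmi_blocked;		/* 0 = blocked, 1 = allowed */
}

int main(void)
{
	struct toy_vcpu v = { .nested_run_pending = true };

	printf("%d\n", toy_nmi_allowed(&v));	/* -EBUSY */
	v.nested_run_pending = false;
	v.nmi_blocked = true;
	printf("%d\n", toy_nmi_allowed(&v));	/* 0: blocked */
	return 0;
}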
+ */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return -EBUSY; + + return !vmx_interrupt_blocked(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4518,10 +4626,8 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) return false; /* fall through */ case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; - /* fall through */ + return !(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: @@ -4616,7 +4722,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) u32 vect_info; vect_info = vmx->idt_vectoring_info; - intr_info = vmx->exit_intr_info; + intr_info = vmx_get_intr_info(vcpu); if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ @@ -4660,9 +4766,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); + cr2 = vmx_get_exit_qual(vcpu); /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept); return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } @@ -4673,7 +4779,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) switch (ex_no) { case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); + dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { if (is_icebp(intr_info)) @@ -4740,7 +4846,7 @@ static int handle_io(struct kvm_vcpu *vcpu) int size, in, string; unsigned port; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); string = (exit_qualification & 16) != 0; ++vcpu->stat.io_exits; @@ -4831,7 +4937,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int err; int ret; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { @@ -4908,7 +5014,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) unsigned long exit_qualification; int dr, dr7, reg; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ @@ -5010,7 +5116,7 @@ static int handle_invd(struct kvm_vcpu *vcpu) static int handle_invlpg(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); kvm_mmu_invlpg(vcpu, exit_qualification); return kvm_skip_emulated_instruction(vcpu); @@ -5042,7 +5148,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; @@ -5063,7 +5169,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ @@ 
-5073,7 +5179,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ @@ -5094,7 +5200,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { @@ -5144,7 +5250,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa_t gpa; u64 error_code; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5216,18 +5322,11 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) bool intr_window_requested; unsigned count = 130; - /* - * We should never reach the point where we are emulating L2 - * due to invalid guest state as that means we incorrectly - * allowed a nested VMEntry with an invalid vmcs12. - */ - WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); - intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { - if (intr_window_requested && vmx_interrupt_allowed(vcpu)) + if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); if (kvm_test_request(KVM_REQ_EVENT, vcpu)) @@ -5404,13 +5503,13 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } @@ -5439,11 +5538,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) if (kvm_get_active_pcid(vcpu) == operand.pcid) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) == operand.pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); @@ -5480,7 +5579,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) trace_kvm_pml_full(vcpu->vcpu_id); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * PML buffer FULL happened while executing iret from NMI, @@ -5499,14 +5598,22 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static int handle_preemption_timer(struct kvm_vcpu *vcpu) +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!vmx->req_immediate_exit && - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) + !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; + } + + return 
EXIT_FASTPATH_NONE; +} +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + handle_fastpath_preemption_timer(vcpu); return 1; } @@ -5594,8 +5701,8 @@ static const int kvm_vmx_max_exit_handlers = static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); + *info1 = vmx_get_exit_qual(vcpu); + *info2 = vmx_get_intr_info(vcpu); } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -5677,7 +5784,6 @@ void dump_vmcs(void) u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; unsigned long cr4; u64 efer; - int i, n; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); @@ -5814,14 +5920,6 @@ void dump_vmcs(void) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); - for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); - if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); @@ -5834,15 +5932,12 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion exit_fastpath) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -5853,6 +5948,14 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (enable_pml) vmx_flush_pml_buffer(vcpu); + /* + * We should never reach this point with a pending nested VM-Enter, and + * more specifically emulation of L2 due to invalid guest state (see + * below) should never happen as that means we incorrectly allowed a + * nested VM-Enter with an invalid vmcs12. 
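On the fastpath_t plumbing introduced above: a handler such as handle_fastpath_preemption_timer() can now tell the run loop to re-enter the guest directly (EXIT_FASTPATH_REENTER_GUEST) instead of unwinding through the full exit path. A minimal standalone model of that contract, with made-up names:

#include <stdbool.h>
#include <stdio.h>

enum toy_fastpath { FASTPATH_NONE, FASTPATH_EXIT_HANDLED, FASTPATH_REENTER_GUEST };

/* Toy preemption-timer fastpath: if the timer can be serviced in place,
 * report that the guest can be re-entered without a full exit. */
static enum toy_fastpath toy_preemption_timer(bool req_immediate_exit)
{
	if (req_immediate_exit)
		return FASTPATH_NONE;	/* fall back to the normal handler */
	puts("expire hv timer in place");
	return FASTPATH_REENTER_GUEST;
}

int main(void)
{
	enum toy_fastpath fp = toy_preemption_timer(false);

	if (fp != FASTPATH_NONE)
		puts("skip the exit-handler table, go back to the guest");
	return 0;
}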
+ */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -5871,8 +5974,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, */ nested_mark_vmcs12_pages_dirty(vcpu); - if (nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); + if (nested_vmx_reflect_vmexit(vcpu)) + return 1; } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { @@ -5919,7 +6022,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) { - if (vmx_interrupt_allowed(vcpu)) { + if (!vmx_interrupt_blocked(vcpu)) { vmx->loaded_vmcs->soft_vnmi_blocked = 0; } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { @@ -5936,10 +6039,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, } } - if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { - kvm_skip_emulated_instruction(vcpu); + if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - } if (exit_reason >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; @@ -6093,7 +6194,15 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (flexpriority_enabled) { sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb(vcpu, true); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + /* + * Flush the TLB, reloading the APIC access page will + * only do so if its physical address has changed, but + * the guest may have inserted a non-APIC mapping into + * the TLB while the APIC access page was disabled. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } break; case LAPIC_MODE_X2APIC: @@ -6107,12 +6216,32 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu)) { - vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb(vcpu, true); + struct page *page; + + /* Defer reload until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; + return; } + + if (!(secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return; + + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; + + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + vmx_flush_tlb_current(vcpu); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. 
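The vmx_set_apic_access_page_addr() rework above defers the reload while L2 is active (via reload_vmcs01_apic_access_page) and replays it once vmcs01 is current again, without pinning the page. A standalone sketch of that defer-and-replay pattern (invented names, not the kernel helpers):

#include <stdbool.h>
#include <stdio.h>

struct toy_vmx {
	bool guest_mode;			/* L2 currently active */
	bool reload_vmcs01_apic_access_page;	/* deferred work flag */
};

static void set_apic_access_page(struct toy_vmx *vmx)
{
	if (vmx->guest_mode) {
		vmx->reload_vmcs01_apic_access_page = true;	/* defer */
		return;
	}
	puts("resolve page, write APIC_ACCESS_ADDR, flush current context");
	/* page is not pinned; the MMU notifier triggers this path again */
}

int main(void)
{
	struct toy_vmx vmx = { .guest_mode = true };

	set_apic_access_page(&vmx);		/* deferred */
	vmx.guest_mode = false;
	if (vmx.reload_vmcs01_apic_access_page) {
		vmx.reload_vmcs01_apic_access_page = false;
		set_apic_access_page(&vmx);	/* replayed on nested VM-Exit */
	}
	return 0;
}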
+ */ + put_page(page); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -6230,16 +6359,16 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) { - vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + if (is_page_fault(intr_info)) { + vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(vmx->exit_intr_info)) { + } else if (is_machine_check(intr_info)) { kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6254,9 +6383,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned long tmp; #endif gate_desc *desc; - u32 intr_info; + u32 intr_info = vmx_get_intr_info(vcpu); - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6269,13 +6397,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) asm volatile( #ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" + "mov %%rsp, %[sp]\n\t" + "and $-16, %%rsp\n\t" + "push %[ss]\n\t" "push %[sp]\n\t" #endif "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" + "push %[cs]\n\t" CALL_NOSPEC : #ifdef CONFIG_X86_64 @@ -6284,7 +6412,9 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) ASM_CALL_CONSTRAINT : [thunk_target]"r"(entry), +#ifdef CONFIG_X86_64 [ss]"i"(__KERNEL_DS), +#endif [cs]"i"(__KERNEL_CS) ); @@ -6292,8 +6422,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6301,12 +6430,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); - else if (!is_guest_mode(vcpu) && - vmx->exit_reason == EXIT_REASON_MSR_WRITE) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } -static bool vmx_has_emulated_msr(int index) +static bool vmx_has_emulated_msr(u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -6337,11 +6463,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. 
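The switch to vmx_get_intr_info() in the irqoff handlers above leans on the lazy exit-info caching added later in this diff (vmx.h): the VMCS field is read at most once per exit and marked available in a bitmask so later consumers reuse the cached copy. A standalone illustration of the pattern with invented names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t vmcs_reads;

static uint32_t fake_vmcs_read_intr_info(void)
{
	vmcs_reads++;			/* stand-in for a (costly) VMREAD */
	return 0x80000b0e;		/* pretend: page-fault interrupt info */
}

struct toy_exit_cache {
	bool intr_info_valid;
	uint32_t intr_info;
};

static uint32_t get_intr_info(struct toy_exit_cache *c)
{
	if (!c->intr_info_valid) {
		c->intr_info = fake_vmcs_read_intr_info();
		c->intr_info_valid = true;
	}
	return c->intr_info;
}

int main(void)
{
	struct toy_exit_cache c = { 0 };

	get_intr_info(&c);
	get_intr_info(&c);
	printf("VMCS reads: %u\n", vmcs_reads);	/* 1 */
	return 0;
}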
- */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + exit_intr_info = vmx_get_intr_info(&vmx->vcpu); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -6508,13 +6631,27 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + switch (to_vmx(vcpu)->exit_reason) { + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); + case EXIT_REASON_PREEMPTION_TIMER: + return handle_fastpath_preemption_timer(vcpu); + default: + return EXIT_FASTPATH_NONE; + } +} + bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { + fastpath_t exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; +reenter_guest: /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) @@ -6523,7 +6660,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) - return; + return EXIT_FASTPATH_NONE; if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -6643,12 +6780,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); - vcpu->arch.regs_dirty = 0; + vmx_register_cache_reset(vcpu); pt_guest_exit(vmx); @@ -6657,18 +6789,45 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; - vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); - if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + if (unlikely(vmx->fail)) { + vmx->exit_reason = 0xdead; + return EXIT_FASTPATH_NONE; + } + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; + trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + + if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + exit_fastpath = vmx_exit_handlers_fastpath(vcpu); + if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) { + if (!kvm_vcpu_exit_request(vcpu)) { + /* + * FIXME: this goto should be a loop in vcpu_enter_guest, + * but it would incur the cost of a retpoline for now. + * Revisit once static calls are available. 
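A standalone model of the reenter_guest loop sketched above: after a REENTER_GUEST fastpath, keep going back into the guest until something (a pending request, resched, signal) demands leaving the inner loop. The exit-request predicate here is a simplified stand-in, not kvm_vcpu_exit_request():

#include <stdbool.h>
#include <stdio.h>

enum toy_fastpath { FASTPATH_NONE, FASTPATH_EXIT_HANDLED, FASTPATH_REENTER_GUEST };

static bool toy_exit_request(int iteration)
{
	/* Pretend a request shows up on the third exit. */
	return iteration >= 3;
}

int main(void)
{
	int iteration = 0;
	enum toy_fastpath fp;

	do {
		printf("enter guest, exit #%d\n", ++iteration);
		fp = FASTPATH_REENTER_GUEST;	/* e.g. TSC-deadline MSR write */
		if (fp == FASTPATH_REENTER_GUEST && toy_exit_request(iteration))
			fp = FASTPATH_EXIT_HANDLED;
	} while (fp == FASTPATH_REENTER_GUEST);

	printf("unwind to the main run loop after %d exits\n", iteration);
	return 0;
}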
+ */ + if (vcpu->arch.apicv_active) + vmx_sync_pir_to_irr(vcpu); + goto reenter_guest; + } + exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; + } + + return exit_fastpath; } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -7138,6 +7297,9 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + + if (vmx_waitpkg_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7253,10 +7415,6 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; - if (kvm_mwait_in_guest(vcpu->kvm) || - kvm_can_post_timer_interrupt(vcpu)) - return -EOPNOTSUPP; - vmx = to_vmx(vcpu); tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); @@ -7599,12 +7757,12 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } -static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - return 1; + return -EBUSY; + return !is_smm(vcpu); } static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -7641,9 +7799,9 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 0; } -static int enable_smi_window(struct kvm_vcpu *vcpu) +static void enable_smi_window(struct kvm_vcpu *vcpu) { - return 0; + /* RSM will cause a vmexit anyway. */ } static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) @@ -7656,6 +7814,16 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.vmxon; } +static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; + + if (hrtimer_try_to_cancel(timer) == 1) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } +} + static void hardware_unsetup(void) { if (nested) @@ -7700,8 +7868,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, @@ -7715,8 +7881,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .tlb_flush = vmx_flush_tlb, + .tlb_flush_all = vmx_flush_tlb_all, + .tlb_flush_current = vmx_flush_tlb_current, .tlb_flush_gva = vmx_flush_tlb_gva, + .tlb_flush_guest = vmx_flush_tlb_guest, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, @@ -7751,7 +7919,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = get_ept_level, + .get_tdp_level = vmx_get_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, @@ -7760,7 +7928,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, @@ -7782,6 +7949,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .post_block = vmx_post_block, 
.pmu_ops = &intel_pmu_ops, + .nested_ops = &vmx_nested_ops, .update_pi_irte = vmx_update_pi_irte, @@ -7797,14 +7965,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .check_nested_events = NULL, - .get_nested_state = NULL, - .set_nested_state = NULL, - .get_vmcs12_pages = NULL, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .migrate_timers = vmx_migrate_timers, }; static __init int hardware_setup(void) @@ -7903,11 +8066,11 @@ static __init int hardware_setup(void) if (!enable_ept) ept_lpage_level = 0; else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PT_PDPE_LEVEL; + ept_lpage_level = PG_LEVEL_1G; else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PT_DIRECTORY_LEVEL; + ept_lpage_level = PG_LEVEL_2M; else - ept_lpage_level = PT_PAGE_TABLE_LEVEL; + ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, ept_lpage_level); /* @@ -7967,8 +8130,7 @@ static __init int hardware_setup(void) nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); - r = nested_vmx_hardware_setup(&vmx_x86_ops, - kvm_vmx_exit_handlers); + r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index aab9df55336e..672c28f17e49 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -8,6 +8,7 @@ #include <asm/intel_pt.h> #include "capabilities.h" +#include "kvm_cache_regs.h" #include "ops.h" #include "vmcs.h" @@ -136,6 +137,7 @@ struct nested_vmx { bool vmcs02_initialized; bool change_vmcs01_virtual_apic_mode; + bool reload_vmcs01_apic_access_page; /* * Enlightened VMCS has been enabled. 
It does not mean that L1 has to @@ -167,6 +169,8 @@ struct nested_vmx { u16 posted_intr_nv; struct hrtimer preemption_timer; + u64 preemption_timer_deadline; + bool has_preemption_timer_deadline; bool preemption_timer_expired; /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ @@ -208,6 +212,7 @@ struct vcpu_vmx { */ bool guest_state_loaded; + unsigned long exit_qualification; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -317,8 +322,8 @@ struct kvm_vmx { }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); -void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + struct loaded_vmcs *buddy); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); @@ -341,6 +346,8 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); +bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); @@ -441,9 +448,18 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) -static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) { - vmx->segment_cache.bitmask = 0; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) + | (1 << VCPU_EXREG_CR0) + | (1 << VCPU_EXREG_CR3) + | (1 << VCPU_EXREG_CR4) + | (1 << VCPU_EXREG_EXIT_INFO_1) + | (1 << VCPU_EXREG_EXIT_INFO_2)); + vcpu->arch.regs_dirty = 0; } static inline u32 vmx_vmentry_ctrl(void) @@ -486,6 +502,28 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + } + return vmx->exit_qualification; +} + +static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + } + return vmx->exit_intr_info; +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -500,24 +538,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, - bool invalidate_gpa) -{ - if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu, - vcpu->arch.mmu->root_hpa)); - } else { - vpid_sync_context(vpid); - } -} - -static inline void 
vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); -} - static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) { vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c17e6eb9ad43..9e41b5135340 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -18,6 +18,7 @@ #include <linux/kvm_host.h> #include "irq.h" +#include "ioapic.h" #include "mmu.h" #include "i8254.h" #include "tss.h" @@ -97,9 +98,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ - #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -194,45 +192,46 @@ u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "req_event", VCPU_STAT(req_event) }, - { "l1d_flush", VCPU_STAT(l1d_flush) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages, .mode = 0444) }, - { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, - { "max_mmu_page_hash_collisions", - VM_STAT(max_mmu_page_hash_collisions) }, + VCPU_STAT("pf_fixed", pf_fixed), + VCPU_STAT("pf_guest", pf_guest), + VCPU_STAT("tlb_flush", tlb_flush), + VCPU_STAT("invlpg", invlpg), + VCPU_STAT("exits", exits), + VCPU_STAT("io_exits", io_exits), + VCPU_STAT("mmio_exits", mmio_exits), + VCPU_STAT("signal_exits", signal_exits), + VCPU_STAT("irq_window", irq_window_exits), + VCPU_STAT("nmi_window", nmi_window_exits), + VCPU_STAT("halt_exits", halt_exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + 
VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hypercalls", hypercalls), + VCPU_STAT("request_irq", request_irq_exits), + VCPU_STAT("irq_exits", irq_exits), + VCPU_STAT("host_state_reload", host_state_reload), + VCPU_STAT("fpu_reload", fpu_reload), + VCPU_STAT("insn_emulation", insn_emulation), + VCPU_STAT("insn_emulation_fail", insn_emulation_fail), + VCPU_STAT("irq_injections", irq_injections), + VCPU_STAT("nmi_injections", nmi_injections), + VCPU_STAT("req_event", req_event), + VCPU_STAT("l1d_flush", l1d_flush), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), + VM_STAT("mmu_pte_write", mmu_pte_write), + VM_STAT("mmu_pte_updated", mmu_pte_updated), + VM_STAT("mmu_pde_zapped", mmu_pde_zapped), + VM_STAT("mmu_flooded", mmu_flooded), + VM_STAT("mmu_recycled", mmu_recycled), + VM_STAT("mmu_cache_miss", mmu_cache_miss), + VM_STAT("mmu_unsync", mmu_unsync), + VM_STAT("remote_tlb_flush", remote_tlb_flush), + VM_STAT("largepages", lpages, .mode = 0444), + VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), + VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), { NULL } }; @@ -261,7 +260,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } @@ -612,15 +611,28 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu->inject_page_fault(vcpu, fault); + struct kvm_mmu *fault_mmu; + WARN_ON_ONCE(fault->vector != PF_VECTOR); + + fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : + vcpu->arch.walk_mmu; + + /* + * Invalidate the TLB entry for the faulting address, if it exists, + * else the access will fault indefinitely (and to emulate hardware). 
+ */ + if ((fault->error_code & PFERR_PRESENT_MASK) && + !(fault->error_code & PFERR_RSVD_MASK)) + kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, + fault_mmu->root_hpa); + fault_mmu->inject_page_fault(vcpu, fault); return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -1008,7 +1020,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { if (!skip_tlb_flush) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } return 0; } @@ -1020,7 +1032,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); + kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@ -1060,7 +1072,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) } } -static void kvm_update_dr7(struct kvm_vcpu *vcpu) +void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -1073,6 +1085,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +EXPORT_SYMBOL_GPL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { @@ -1233,13 +1246,18 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1306,6 +1324,7 @@ static const u32 msr_based_features_all[] = { MSR_F10H_DECFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, }; static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; @@ -1564,6 +1583,13 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +{ + return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || + need_resched() || signal_pending(current); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); + /* * The fast path for frequent and performance sensitive wrmsr emulation, * i.e. 
the sending of IPI, sending IPI early in the VM-Exit flow reduces @@ -1592,27 +1618,44 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data return 1; } -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) +{ + if (!kvm_can_use_hv_timer(vcpu)) + return 1; + + kvm_set_lapic_tscdeadline_msr(vcpu, data); + return 0; +} + +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - int ret = 0; + fastpath_t ret = EXIT_FASTPATH_NONE; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); - ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_EXIT_HANDLED; + } + break; + case MSR_IA32_TSCDEADLINE: + data = kvm_read_edx_eax(vcpu); + if (!handle_fastpath_set_tscdeadline(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_REENTER_GUEST; + } break; default: - return EXIT_FASTPATH_NONE; + break; } - if (!ret) { + if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); - return EXIT_FASTPATH_SKIP_EMUL_INS; - } - return EXIT_FASTPATH_NONE; + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -1901,7 +1944,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); + u64 curr_offset = vcpu->arch.l1_tsc_offset; vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1943,14 +1986,13 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); - - return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + vcpu->arch.l1_tsc_offset = offset; vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } @@ -2075,7 +2117,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); + u64 tsc_offset = vcpu->arch.l1_tsc_offset; kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } @@ -2637,29 +2679,54 @@ out: return r; } +static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + return (vcpu->arch.apf.msr_en_val & mask) == mask; +} + static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 3:5 are reserved, Should be zero */ - if (data & 0x38) + /* Bits 4:5 are reserved, Should be zero */ + if (data & 0x30) return 1; - vcpu->arch.apf.msr_val = data; + vcpu->arch.apf.msr_en_val = data; - if (!(data & KVM_ASYNC_PF_ENABLED)) { + if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) + sizeof(u64))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + 
kvm_async_pf_wakeup_all(vcpu); + + return 0; +} + +static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) +{ + /* Bits 8-63 are reserved */ + if (data >> 8) + return 1; + + if (!lapic_in_kernel(vcpu)) + return 1; + + vcpu->arch.apf.msr_int_val = data; + + vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; + return 0; } @@ -2669,10 +2736,16 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_all(vcpu); +} + +static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush_guest(vcpu); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -2698,7 +2771,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb(vcpu, false); + kvm_vcpu_flush_tlb_guest(vcpu); vcpu->arch.st.preempted = 0; @@ -2875,6 +2948,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_ASYNC_PF_INT: + if (kvm_pv_enable_async_pf_int(vcpu, data)) + return 1; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (data & 0x1) { + vcpu->arch.apf.pageready_pending = false; + kvm_check_async_pf_completion(vcpu); + } + break; case MSR_KVM_STEAL_TIME: if (unlikely(!sched_info_on())) @@ -2932,6 +3015,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3071,7 +3156,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; case MSR_IA32_UCODE_REV: @@ -3149,7 +3234,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_en_val; + break; + case MSR_KVM_ASYNC_PF_INT: + msr_info->data = vcpu->arch.apf.msr_int_val; + break; + case MSR_KVM_ASYNC_PF_ACK: + msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: msr_info->data = vcpu->arch.st.msr_val; @@ -3187,6 +3278,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... 
HV_X64_MSR_STIMER3_COUNT: @@ -3233,7 +3326,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + return kvm_pmu_get_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); @@ -3363,6 +3456,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: @@ -3431,14 +3525,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops.get_nested_state ? - kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.nested_ops->get_state ? + kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops.nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; default: break; @@ -4226,9 +4320,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_enable_evmcs) + if (!kvm_x86_ops.nested_ops->enable_evmcs) return -ENOTTY; - r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4543,7 +4637,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops.get_nested_state) + if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4551,8 +4645,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, - user_data_size); + r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, + user_data_size); if (r < 0) break; @@ -4573,7 +4667,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops.set_nested_state) + if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; @@ -4586,7 +4680,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE - | KVM_STATE_NESTED_EVMCS)) + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING + | KVM_STATE_NESTED_GIF_SET)) break; /* nested_run_pending implies guest_mode. 
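The nested_ops conversion visible above gathers the per-vendor nested callbacks into one table, while KVM_CAP_NESTED_STATE keeps the convention of calling get_state with no buffer to learn the required size. A toy version of that table-plus-probe pattern, with invented names:

#include <stddef.h>
#include <stdio.h>

struct toy_nested_ops {
	int (*get_state)(void *vcpu, void *buf, unsigned int size);
};

/* Probe convention: with no buffer, report how many bytes a caller needs. */
static int toy_get_state(void *vcpu, void *buf, unsigned int size)
{
	(void)vcpu; (void)buf; (void)size;
	return 128;	/* pretend the nested state blob is 128 bytes */
}

static const struct toy_nested_ops toy_ops = { .get_state = toy_get_state };

int main(void)
{
	int cap = toy_ops.get_state ? toy_ops.get_state(NULL, NULL, 0) : 0;

	printf("KVM_CAP_NESTED_STATE reports %d\n", cap);
	return 0;
}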
*/ @@ -4595,7 +4690,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -5242,6 +5337,10 @@ static void kvm_init_msr_list(void) if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) continue; break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + continue; + break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) @@ -5259,7 +5358,7 @@ static void kvm_init_msr_list(void) !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) @@ -5274,7 +5373,7 @@ static void kvm_init_msr_list(void) if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; - } + break; default: break; } @@ -6403,7 +6502,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, @@ -7669,14 +7768,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu) +static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; + bool can_inject = true; /* try to reinject previous events if any */ - if (vcpu->arch.exception.injected) + if (vcpu->arch.exception.injected) { kvm_x86_ops.queue_exception(vcpu); + can_inject = false; + } /* * Do not inject an NMI or interrupt if there is a pending * exception. Exceptions and interrupts are recognized at @@ -7692,22 +7794,28 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) * fully complete the previous instruction. */ else if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) + if (vcpu->arch.nmi_injected) { kvm_x86_ops.set_nmi(vcpu); - else if (vcpu->arch.interrupt.injected) + can_inject = false; + } else if (vcpu->arch.interrupt.injected) { kvm_x86_ops.set_irq(vcpu); + can_inject = false; + } } + WARN_ON_ONCE(vcpu->arch.exception.injected && + vcpu->arch.exception.pending); + /* * Call check_nested_events() even if we reinjected a previous event * in order for caller to determine if it should require immediate-exit * from L2 to L1 due to pending L1 events which require exit * from L2 to L1. 
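The inject_pending_event() rework below tracks a single injection slot per VM-entry with can_inject: once a re-injected or newly queued event consumes it, later sources get -EBUSY and fall back to requesting a window-open or immediate exit. A toy model of that bookkeeping (not the kernel code):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_vcpu { bool nmi_pending; bool irq_pending; };

static int try_inject(bool *can_inject, bool pending, const char *what)
{
	if (!pending)
		return 0;
	if (!*can_inject)
		return -EBUSY;		/* slot already consumed this entry */
	printf("inject %s\n", what);
	*can_inject = false;
	return 1;
}

int main(void)
{
	struct toy_vcpu v = { .nmi_pending = true, .irq_pending = true };
	bool can_inject = true;

	try_inject(&can_inject, v.nmi_pending, "NMI");
	if (try_inject(&can_inject, v.irq_pending, "IRQ") == -EBUSY)
		puts("IRQ deferred: open an interrupt window / immediate exit");
	return 0;
}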
*/ - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { - r = kvm_x86_ops.check_nested_events(vcpu); - if (r != 0) - return r; + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); + if (r < 0) + goto busy; } /* try to inject new event if pending */ @@ -7716,7 +7824,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); - WARN_ON_ONCE(vcpu->arch.exception.injected); vcpu->arch.exception.pending = false; vcpu->arch.exception.injected = true; @@ -7725,16 +7832,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) X86_EFLAGS_RF); if (vcpu->arch.exception.nr == DB_VECTOR) { - /* - * This code assumes that nSVM doesn't use - * check_nested_events(). If it does, the - * DR6/DR7 changes should happen before L1 - * gets a #VMEXIT for an intercepted #DB in - * L2. (Under VMX, on the other hand, the - * DR6/DR7 changes should not happen in the - * event of a VM-exit to L1 for an intercepted - * #DB in L2.) - */ kvm_deliver_exception_payload(vcpu); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; @@ -7743,42 +7840,72 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) } kvm_x86_ops.queue_exception(vcpu); + can_inject = false; } - /* Don't consider new event if we re-injected an event */ - if (kvm_event_needs_reinjection(vcpu)) - return 0; + /* + * Finally, inject interrupt events. If an event cannot be injected + * due to architectural conditions (e.g. IF=0) a window-open exit + * will re-request KVM_REQ_EVENT. Sometimes however an event is pending + * and can architecturally be injected, but we cannot do it right now: + * an interrupt could have arrived just now and we have to inject it + * as a vmexit, or there could already an event in the queue, which is + * indicated by can_inject. In that case we request an immediate exit + * in order to make progress and get back here for another iteration. + * The kvm_x86_ops hooks communicate this by returning -EBUSY. + */ + if (vcpu->arch.smi_pending) { + r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; + enter_smm(vcpu); + can_inject = false; + } else + kvm_x86_ops.enable_smi_window(vcpu); + } - if (vcpu->arch.smi_pending && !is_smm(vcpu) && - kvm_x86_ops.smi_allowed(vcpu)) { - vcpu->arch.smi_pending = false; - ++vcpu->arch.smi_count; - enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops.set_nmi(vcpu); - } else if (kvm_cpu_has_injectable_intr(vcpu)) { - /* - * Because interrupts can be injected asynchronously, we are - * calling check_nested_events again here to avoid a race condition. - * See https://lkml.org/lkml/2014/7/2/60 for discussion about this - * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? - */ - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { - r = kvm_x86_ops.check_nested_events(vcpu); - if (r != 0) - return r; + if (vcpu->arch.nmi_pending) { + r = can_inject ? 
kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + kvm_x86_ops.set_nmi(vcpu); + can_inject = false; + WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0); } - if (kvm_x86_ops.interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - false); + if (vcpu->arch.nmi_pending) + kvm_x86_ops.enable_nmi_window(vcpu); + } + + if (kvm_cpu_has_injectable_intr(vcpu)) { + r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); kvm_x86_ops.set_irq(vcpu); + WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0); } + if (kvm_cpu_has_injectable_intr(vcpu)) + kvm_x86_ops.enable_irq_window(vcpu); } - return 0; + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + *req_immediate_exit = true; + + WARN_ON(vcpu->arch.exception.pending); + return; + +busy: + *req_immediate_exit = true; + return; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -8169,24 +8296,13 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; - if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops.set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) - return; - kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page)); - - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - put_page(page); + kvm_x86_ops.set_apic_access_page_addr(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -8206,13 +8322,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); - enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; + fastpath_t exit_fastpath; bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) { + if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { r = 0; goto out; } @@ -8234,8 +8350,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu, true); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + kvm_vcpu_flush_tlb_all(vcpu); + + /* Flushing all ASIDs flushes the current ASID... 
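The -EBUSY convention spelled out in the comment above turns the smi/nmi/interrupt_allowed hooks into three-way answers. A vendor implementation is expected to follow roughly the pattern below; injection_must_wait() and nmi_blocked() are illustrative placeholders, not helpers from this series.

/*
 * Illustrative vendor-side hook for the new contract consumed by
 * inject_pending_event() above:
 *   -EBUSY : cannot inject now and cannot simply open a window either
 *            (e.g. a nested VM entry must complete first), so the caller
 *            requests an immediate exit and retries on the next iteration;
 *   0      : architecturally blocked, the caller opens an NMI window;
 *   1      : an NMI can be injected right away.
 * Both helpers below are hypothetical stand-ins for vendor state checks.
 */
static int example_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	if (injection_must_wait(vcpu, for_injection))
		return -EBUSY;

	return !nmi_blocked(vcpu);
}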
*/ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -8308,6 +8433,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); + if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) + kvm_check_async_pf_completion(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -8318,32 +8445,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - if (inject_pending_event(vcpu) != 0) - req_immediate_exit = true; - else { - /* Enable SMI/NMI/IRQ window open exits if needed. - * - * SMIs have three cases: - * 1) They can be nested, and then there is nothing to - * do here because RSM will cause a vmexit anyway. - * 2) There is an ISA-specific reason why SMI cannot be - * injected, and the moment when this changes can be - * intercepted. - * 3) Or the SMI can be pending because - * inject_pending_event has completed the injection - * of an IRQ or NMI from the previous vmexit, and - * then we request an immediate exit to inject the - * SMI. - */ - if (vcpu->arch.smi_pending && !is_smm(vcpu)) - if (!kvm_x86_ops.enable_smi_window(vcpu)) - req_immediate_exit = true; - if (vcpu->arch.nmi_pending) - kvm_x86_ops.enable_nmi_window(vcpu); - if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops.enable_irq_window(vcpu); - WARN_ON(vcpu->arch.exception.pending); - } + inject_pending_event(vcpu, &req_immediate_exit); + if (req_int_win) + kvm_x86_ops.enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -8391,8 +8495,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) kvm_x86_ops.sync_pir_to_irr(vcpu); - if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) - || need_resched() || signal_pending(current)) { + if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -8424,7 +8527,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops.run(vcpu); + exit_fastpath = kvm_x86_ops.run(vcpu); /* * Do this here before restoring debug registers on the host. 
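The open-coded exit check removed above is now wrapped in kvm_vcpu_exit_request(), declared in x86.h further down. Judging purely by the condition it replaces, the helper presumably boils down to the sketch below.

/*
 * Presumed body of kvm_vcpu_exit_request(), inferred from the open-coded
 * condition it replaces in vcpu_enter_guest(); the actual definition lives
 * elsewhere in x86.c and may differ in detail.
 */
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
	return vcpu->mode == EXITING_GUEST_MODE ||
	       kvm_request_pending(vcpu) ||
	       need_resched() ||
	       signal_pending(current);
}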
And @@ -8455,7 +8558,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath); + kvm_x86_ops.handle_exit_irqoff(vcpu); /* * Consume any pending interrupts, including the possible source of @@ -8502,6 +8605,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) return r; cancel_injection: + if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_x86_ops.cancel_injection(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); @@ -8544,8 +8649,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) - kvm_x86_ops.check_nested_events(vcpu); + if (is_guest_mode(vcpu)) + kvm_x86_ops.nested_ops->check_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -8581,8 +8686,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; } - kvm_check_async_pf_completion(vcpu); - if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -8727,8 +8830,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); @@ -8746,18 +8850,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@ -8787,7 +8891,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@ -9379,9 +9483,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) } fx_init(vcpu); - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -9502,7 +9605,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.cr2 = 0; kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; + vcpu->arch.apf.msr_en_val = 0; + vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); @@ -10040,7 +10144,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); return; } @@ -10080,7 +10184,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } else { int level = kvm_dirty_log_manual_protect_and_init_set(kvm) ? 
- PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + PG_LEVEL_2M : PG_LEVEL_4K; /* * If we're with initial-all-set, we don't need @@ -10182,11 +10286,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops.nmi_allowed(vcpu))) + kvm_x86_ops.nmi_allowed(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && !is_smm(vcpu))) + (vcpu->arch.smi_pending && + kvm_x86_ops.smi_allowed(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -10197,6 +10302,11 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_hv_has_stimer_pending(vcpu)) return true; + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + return true; + return false; } @@ -10233,7 +10343,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.interrupt_allowed(vcpu); + return kvm_x86_ops.interrupt_allowed(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -10298,12 +10408,14 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { + BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); } static inline u32 kvm_async_pf_next_probe(u32 key) { - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); + return (key + 1) & (ASYNC_PF_PER_VCPU - 1); } static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -10321,7 +10433,7 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) int i; u32 key = kvm_async_pf_hash_fn(gfn); - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + for (i = 0; i < ASYNC_PF_PER_VCPU && (vcpu->arch.apf.gfns[key] != gfn && vcpu->arch.apf.gfns[key] != ~0); i++) key = kvm_async_pf_next_probe(key); @@ -10339,6 +10451,10 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) u32 i, j, k; i = j = kvm_async_pf_gfn_slot(vcpu, gfn); + + if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn)) + return; + while (true) { vcpu->arch.apf.gfns[i] = ~0; do { @@ -10357,18 +10473,32 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) } } -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) { + u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT; - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason, + sizeof(reason)); } -static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) { + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, - sizeof(u32)); + return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &token, offset, sizeof(token)); +} + +static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) +{ + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); + u32 val; + + if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &val, offset, sizeof(val))) + return false; + + return !val; } static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) @@ -10376,9 +10506,8 @@ static bool kvm_can_deliver_async_pf(struct 
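The gfn bookkeeping above is an open-addressed table with linear probing; dropping roundup_pow_of_two() in favour of a plain mask plus the BUILD_BUG_ON only works because ASYNC_PF_PER_VCPU is a power of two. The self-contained toy below (userspace, illustrative only) shows the same probing scheme.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy version of the open-addressed, linear-probing table used for
 * vcpu->arch.apf.gfns above.  TABLE_SIZE must be a power of two so that
 * "(key + 1) & (TABLE_SIZE - 1)" wraps around, which is exactly what
 * BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)) enforces in the patch.
 * EMPTY plays the role of the ~0 sentinel in the real code.
 */
#define TABLE_SIZE	64U
#define EMPTY		(~0ULL)

static uint64_t table[TABLE_SIZE];

static uint32_t hash_fn(uint64_t gfn)
{
	/* stand-in for hash_32(); any reasonable mixer works for the demo */
	return (uint32_t)((gfn * 0x9E3779B97F4A7C15ULL) >> 58) & (TABLE_SIZE - 1);
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (TABLE_SIZE - 1);	/* power-of-two wraparound */
}

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);

	while (table[key] != EMPTY)
		key = next_probe(key);
	table[key] = gfn;
}

static int find_gfn(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);
	uint32_t i;

	for (i = 0; i < TABLE_SIZE && table[key] != EMPTY; i++) {
		if (table[key] == gfn)
			return 1;
		key = next_probe(key);
	}
	return 0;
}

int main(void)
{
	uint32_t i;

	for (i = 0; i < TABLE_SIZE; i++)
		table[i] = EMPTY;

	add_gfn(0x1234);
	add_gfn(0x5678);
	printf("0x1234 present: %d, 0x9999 present: %d\n",
	       find_gfn(0x1234), find_gfn(0x9999));
	return 0;
}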
kvm_vcpu *vcpu) if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops.get_cpl(vcpu) == 0)) + if (!kvm_pv_async_pf_enabled(vcpu) || + (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0)) return false; return true; @@ -10398,7 +10527,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) * If interrupts are off we cannot even use an artificial * halt state. */ - return kvm_x86_ops.interrupt_allowed(vcpu); + return kvm_arch_interrupt_allowed(vcpu); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, @@ -10410,7 +10539,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + !apf_put_user_notpresent(vcpu)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -10434,8 +10563,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - struct x86_exception fault; - u32 val; + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vcpu->arch.apf.vec + }; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -10443,39 +10574,29 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); - if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && - !apf_get_user(vcpu, &val)) { - if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && - vcpu->arch.exception.pending && - vcpu->arch.exception.nr == PF_VECTOR && - !apf_put_user(vcpu, 0)) { - vcpu->arch.exception.injected = false; - vcpu->arch.exception.pending = false; - vcpu->arch.exception.nr = 0; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.error_code = 0; - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; - } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); - } + if (kvm_pv_async_pf_enabled(vcpu) && + !apf_put_user_ready(vcpu, work->arch.token)) { + vcpu->arch.apf.pageready_pending = true; + kvm_apic_set_irq(vcpu, &irq, NULL); } + vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_APF_READY, vcpu); + if (!vcpu->arch.apf.pageready_pending) + kvm_vcpu_kick(vcpu); +} + +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + if (!kvm_pv_async_pf_enabled(vcpu)) return true; else - return kvm_can_do_async_pf(vcpu); + return apf_pageready_slot_free(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b968acc0516f..6eb62e97e59f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -125,6 +125,12 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + 
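The host code above delivers "page ready" as an ordinary APIC interrupt on vcpu->arch.apf.vec instead of a fake #PF. The guest half of that ABI is not in this merge (the pull request defers it), so the following is only an illustration of how a guest would be expected to opt in, using the constant names introduced on the host side; treat them as assumptions.

/*
 * Hypothetical guest-side opt-in to interrupt-based "page ready" delivery.
 * Not part of this merge: the sequence merely illustrates the ABI that the
 * host code above implements.  apf_data_pa is the physical address of this
 * vCPU's struct kvm_vcpu_pv_apf_data, set up elsewhere.
 */
static void example_enable_async_pf_int(u8 vector, u64 apf_data_pa)
{
	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT))
		return;

	/* tell the host which vector to raise for "page ready" events */
	wrmsrl(MSR_KVM_ASYNC_PF_INT, vector);

	/* enable async PF and ask for interrupt (not #PF) "page ready" delivery */
	wrmsrl(MSR_KVM_ASYNC_PF_EN,
	       apf_data_pa | KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT);
}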
++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_current(vcpu); +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); @@ -268,7 +274,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 supported_xcr0; @@ -358,5 +364,6 @@ static inline bool kvm_dr7_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); +bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dffe8e4d3140..c5437f2964ee 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ +#include <asm/kvm_para.h> /* kvm_handle_async_pf */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -1359,6 +1360,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { prefetchw(&current->mm->mmap_sem); + /* + * KVM has two types of events that are, logically, interrupts, but + * are unfortunately delivered using the #PF vector. These events are + * "you just accessed valid memory, but the host doesn't have it right + * now, so I'll put you to sleep if you continue" and "that memory + * you tried to access earlier is available now." + * + * We are relying on the interrupted context being sane (valid RSP, + * relevant locks not held, etc.), which is fine as long as the + * interrupted context had IF=1. We are also relying on the KVM + * async pf type field and CR2 being read consistently instead of + * getting values from real and async page faults mixed up. + * + * Fingers crossed. + */ + if (kvm_handle_async_pf(regs, (u32)address)) + return; + trace_page_fault_entries(regs, hw_error_code, address); if (unlikely(kmmio_fault(regs, address)))
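As the new comment in do_page_fault() explains, two logically interrupt-like KVM events still arrive through the #PF vector. Conceptually, kvm_handle_async_pf() (in arch/x86/kernel/kvm.c, not shown in this hunk) dispatches on the per-vCPU reason field along the lines of the sketch below; the helper names are the pre-existing async-PF helpers, and details such as RCU and entry-code bookkeeping are omitted.

/*
 * Conceptual sketch only: the real kvm_handle_async_pf() lives in
 * arch/x86/kernel/kvm.c and differs in detail.  It reads the reason the
 * host stored in the shared kvm_vcpu_pv_apf_data area and tells
 * do_page_fault() whether this #PF was really an async-PF event.
 */
bool example_handle_async_pf(struct pt_regs *regs, u32 token)
{
	switch (kvm_read_and_reset_pf_reason()) {
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* host is still faulting the page in: put the task to sleep */
		kvm_async_pf_task_wait(token, !user_mode(regs));
		return true;
	case KVM_PV_REASON_PAGE_READY:
		/* the earlier access can be retried: wake the sleeping task */
		kvm_async_pf_task_wake(token);
		return true;
	default:
		/* an ordinary page fault; let do_page_fault() handle it */
		return false;
	}
}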