diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 1120 |
1 files changed, 911 insertions, 209 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 46b428c0990e..1519f030fd73 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -38,6 +38,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include <asm/asm.h> #include <asm/cpu.h> #include <asm/io.h> #include <asm/desc.h> @@ -332,23 +333,54 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; }; #define NR_AUTOLOAD_MSRS 8 +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + struct vmcs { - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; char data[0]; }; /* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +/* * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs * loaded on this CPU (so we can clear them if the CPU goes down). 
@@ -359,14 +391,13 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; }; struct shared_msr_entry { @@ -397,7 +428,7 @@ struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. */ - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ @@ -565,7 +596,7 @@ struct __packed vmcs12 { "Offset of " #field " in struct vmcs12 has changed.") static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); @@ -784,6 +815,12 @@ struct nested_vmx { */ struct vmcs12 *cached_vmcs12; /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 */ @@ -933,25 +970,20 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. 
*/ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { struct vmx_msrs guest; struct vmx_msrs host; } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - int gs_ldt_reload_needed; - int fs_reload_needed; - u64 msr_host_bndcfgs; - } host_state; + struct { int vm86_active; ulong save_rflags; @@ -1001,6 +1033,7 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; }; enum segment_cache_field { @@ -1220,6 +1253,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); @@ -1490,6 +1528,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) * GUEST_IA32_RTIT_CTL = 0x00002814, */ } + +/* check_ept_pointer_match() should be under protection of ept_pointer_lock. 
*/ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + int ret; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + ret = -ENOTSUPP; + goto out; + } + + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + +out: + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -1864,6 +1944,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) CPU_BASED_MONITOR_TRAP_FLAG; } +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1944,6 +2030,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) VMX_VMFUNC_EPTP_SWITCHING); } +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1974,11 +2065,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; 
} operand = { vpid, 0, gva }; + bool error; - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) + : "memory"); + BUG_ON(error); } static inline void __invept(int ext, u64 eptp, gpa_t gpa) @@ -1986,11 +2078,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; + bool error; - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) + : "memory"); + BUG_ON(error); } static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) @@ -2006,12 +2099,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } @@ -2028,15 +2121,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) 
printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } @@ -2114,6 +2207,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) @@ -2248,10 +2354,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) + : CC_OUT(na) (error) : "a"(value), "d"(field)); if (unlikely(error)) vmwrite_error(field, value); } @@ -2735,121 +2841,150 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); - unsigned long fs_base, kernel_gs_base; #endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; int i; - if (vmx->host_state.loaded) + if (vmx->loaded_cpu_state) return; - vmx->host_state.loaded = 1; + vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; + /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. 
*/ - vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - kernel_gs_base = current->thread.gsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { -#endif - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); -#ifdef CONFIG_X86_64 + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); - } -#endif - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; - } - -#ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); - - vmx->msr_host_kernel_gs_base = kernel_gs_base; if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + 
gs_base = segment_base(gs_sel); #endif - if (boot_cpu_has(X86_FEATURE_MPX)) - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); + + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } + for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, vmx->guest_msrs[i].mask); } -static void __vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) + struct vmcs_host_state *host_state; + + if (!vmx->loaded_cpu_state) return; + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; + ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; + vmx->loaded_cpu_state = NULL; + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); + load_gs_index(host_state->gs_sel); #else - loadsegment(gs, vmx->host_state.gs_sel); + loadsegment(gs, host_state->gs_sel); #endif } - if (vmx->host_state.fs_reload_needed) - loadsegment(fs, vmx->host_state.fs_sel); + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); 
#ifdef CONFIG_X86_64 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { - loadsegment(ds, vmx->host_state.ds_sel); - loadsegment(es, vmx->host_state.es_sel); + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (vmx->host_state.msr_host_bndcfgs) - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); load_fixmap_gdt(raw_smp_processor_id()); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, + vmx->msr_guest_kernel_gs_base); + preempt_enable(); + } + return vmx->msr_guest_kernel_gs_base; } +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + } + vmx->msr_guest_kernel_gs_base = data; +} +#endif + static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -2991,7 +3126,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); - __vmx_load_host_state(to_vmx(vcpu)); + vmx_prepare_switch_to_host(to_vmx(vcpu)); } static bool emulation_required(struct kvm_vcpu *vcpu) @@ -3212,7 +3347,7 @@ static bool vmx_rdtscp_supported(void) static bool vmx_invpcid_supported(void) { - return cpu_has_vmx_invpcid() && enable_ept; + return cpu_has_vmx_invpcid(); } /* @@ -3455,6 +3590,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 
SECONDARY_EXEC_WBINVD_EXITING; + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3922,8 +4063,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - msr_info->data = vmx->msr_guest_kernel_gs_base; + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: @@ -4023,8 +4163,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: @@ -4559,7 +4698,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -4573,10 +4712,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) /* KVM supports Enlightened VMCS v1 only */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = KVM_EVMCS_VERSION; + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; + if (shadow) + vmcs->hdr.shadow_vmcs = 1; return vmcs; } @@ -4600,14 +4741,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(void) +static struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); } static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { - loaded_vmcs->vmcs = alloc_vmcs(); + loaded_vmcs->vmcs = 
alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; @@ -4629,6 +4770,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) evmcs->hv_enlightenments_control.msr_bitmap = 1; } } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + return 0; out_vmcs: @@ -4738,7 +4882,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(false, cpu); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -4755,7 +4899,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; } @@ -4912,10 +5056,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return; /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in + * 64-bit mode as a 64-bit kernel may frequently access the + * MSR. This means we need to manually save/restore the MSR + * when switching between guest and host state, but only if + * the guest is in 64-bit mode. Sync our cached value if the + * guest is transitioning to 32-bit mode and the CPU contains + * guest state, i.e. the cache is stale. 
*/ - vmx_load_host_state(to_vmx(vcpu)); +#ifdef CONFIG_X86_64 + if (!(efer & EFER_LMA)) + (void)vmx_read_guest_kernel_gs_base(vmx); +#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -4972,6 +5124,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. + */ +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -5153,6 +5319,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; u64 eptp; @@ -5160,15 +5327,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + if (enable_unrestricted_guest || is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -6104,19 +6279,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); 
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in - * __vmx_load_host_state(), in case userspace uses the null selectors - * too (the expected case). + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); @@ -6241,8 +6416,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; - /* Enable INVPCID for non-ept guests may cause performance regression. 
*/ - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -6371,9 +6544,6 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { -#ifdef CONFIG_X86_64 - unsigned long a; -#endif int i; if (enable_shadow_vmcs) { @@ -6428,15 +6598,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); @@ -7670,6 +7833,7 @@ static void vmx_enable_tdp(void) static __init int hardware_setup(void) { + unsigned long host_bndcfgs; int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7694,6 +7858,11 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7730,6 +7899,12 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; @@ -7756,6 +7931,11 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + 
kvm_x86_ops->set_nested_state = NULL; + } + /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -8032,10 +8212,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) return 0; } +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already has an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); @@ -8046,16 +8251,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; 
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); @@ -8067,6 +8268,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return 0; out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: @@ -8109,7 +8313,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* CPL=0 must be checked manually. */ if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 1; } @@ -8172,15 +8376,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - if (vmx_get_cpl(vcpu)) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); return 0; } + return 1; } @@ -8233,6 +8438,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -8318,7 +8524,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). 
*/ -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); @@ -8327,7 +8533,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - p = ((char *)(get_vmcs12(vcpu))) + offset; + p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: @@ -8349,10 +8555,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, } -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); - char *p = ((char *) get_vmcs12(vcpu)) + offset; + char *p = (char *)vmcs12 + offset; if (offset < 0) return offset; @@ -8405,7 +8611,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; field_value = __vmcs_readl(field); - vmcs12_write_any(&vmx->vcpu, field, field_value); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); } /* * Skip the VM-exit information fields if they are read-only. 
@@ -8440,7 +8646,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(&vmx->vcpu, field, &field_value); + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); __vmcs_writel(field, field_value); } } @@ -8470,6 +8676,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8477,10 +8684,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_vmcs12(vcpu)) return kvm_skip_emulated_instruction(vcpu); + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + } + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8522,6 +8743,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8556,23 +8778,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(vcpu, field, field_value) < 0) { + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * 
When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } - switch (field) { + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { #define SHADOW_FIELD_RW(x) case x: #include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. 
+ */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } } nested_vmx_succeed(vcpu); @@ -8623,7 +8866,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, @@ -8821,6 +9066,105 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case 
INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -9024,6 +9368,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; @@ -9196,6 +9541,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode 
instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + /* * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we * should handle it ourselves in L0 (and then continue L2). Only call this @@ -9280,10 +9649,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* @@ -10244,15 +10618,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); - 
vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the @@ -10448,9 +10822,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * - * We can't defer this to vmx_load_host_state() since that function - * may be executed in interrupt context, which saves and restore segments - * around it, nullifying its effect. + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); @@ -10511,8 +10885,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); } @@ -10652,6 +11026,8 @@ free_vcpu: static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -10876,11 +11252,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; - kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu)); + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -10928,9 +11304,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void 
nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct page *page; u64 hpa; @@ -11171,6 +11547,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -11228,11 +11636,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; u64 count, addr; - if (vmcs12_read_any(vcpu, count_field, &count) || - vmcs12_read_any(vcpu, addr_field, &addr)) { + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { WARN_ON(1); return -EINVAL; } @@ -11282,6 +11691,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + 
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -11431,12 +11853,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 1; } } - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } - kvm_mmu_reset_context(vcpu); + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3, false); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + return 0; } @@ -11523,7 +11949,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set host-state according to L0's settings (vmcs12 is irrelevant here) * Some constant fields are set here by vmx_set_constant_host_state(). * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() + * is called. */ vmx_set_constant_host_state(vmx); @@ -11595,11 +12022,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - /* - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, - * HOST_FS_BASE, HOST_GS_BASE. 
- */ - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); @@ -11664,6 +12086,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); @@ -11883,6 +12308,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high) || @@ -11983,6 +12411,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return 0; } +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { @@ -11994,8 +12449,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && 
- vmcs12->vmcs_link_pointer != -1ull) { + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } @@ -12042,12 +12496,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) +/* + * If exit_qual is NULL, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. + */ +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_qual; - int r; + bool from_vmentry = !!exit_qual; + u32 dummy_exit_qual; + int r = 0; enter_guest_mode(vcpu); @@ -12061,17 +12520,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) vcpu->arch.tsc_offset += vmcs12->tsc_offset; r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) goto fail; - nested_get_vmcs12_pages(vcpu, vmcs12); + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); - r = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto fail; + r = EXIT_REASON_MSR_LOAD_FAIL; + *exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (*exit_qual) + goto fail; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. 
At this point @@ -12086,8 +12556,7 @@ fail: vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); - return 1; + return r; } /* @@ -12110,6 +12579,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12 = get_vmcs12(vcpu); + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) { + nested_vmx_failInvalid(vcpu); + goto out; + } + if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); @@ -12164,16 +12644,29 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * transferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu.
*/ @@ -12682,6 +13175,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); @@ -13256,7 +13760,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, NULL); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -13271,6 +13775,199 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx->nested.current_vmptr != -1ull) { + kvm_state.size += VMCS12_SIZE; + + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= 
KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (vmx->nested.current_vmptr == -1ull) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow vmcs linked to vmcs01, unless + * sync_shadow_vmcs is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) + sync_vmcs12(vcpu, vmcs12); + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->hdr.revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if 
(copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vmx->nested.nested_run_pending = 1; + + vmx->nested.dirty_vmcs12 = true; + ret = enter_vmx_non_root_mode(vcpu, NULL); + if (ret) + return -EINVAL; + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -13290,7 +13987,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -13323,6 +14020,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, @@ -13405,6 +14103,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, |