diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 2393 |
1 files changed, 1815 insertions, 578 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 801332edefc3..f5e8dce8046c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> +#include <asm/apic.h> #include "trace.h" @@ -99,13 +100,18 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); +static u64 __read_mostly host_xss; + +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT) + | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -125,14 +131,32 @@ module_param(nested, bool, S_IRUGO); * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_GAP 128 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 +#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 +#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ + INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW + static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +/* Default doubles per-vcpu window every exit. */ +static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, int, S_IRUGO); + +/* Default resets per-vcpu window every exit to ple_window. */ +static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, int, S_IRUGO); + +/* Default is to compute the maximum so we can never overflow. */ +static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, int, S_IRUGO); + extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 @@ -195,7 +219,13 @@ struct __packed vmcs12 { u64 tsc_offset; u64 virtual_apic_page_addr; u64 apic_access_addr; + u64 posted_intr_desc_addr; u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; u64 guest_ia32_debugctl; @@ -309,6 +339,7 @@ struct __packed vmcs12 { u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; + u16 posted_intr_nv; u16 guest_es_selector; u16 guest_cs_selector; u16 guest_ss_selector; @@ -317,6 +348,7 @@ struct __packed vmcs12 { u16 guest_gs_selector; u16 guest_ldtr_selector; u16 guest_tr_selector; + u16 guest_intr_status; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -379,10 +411,35 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; }; #define POSTED_INTR_ON 0 @@ -450,6 +507,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -481,6 +539,14 @@ struct vcpu_vmx { /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; }; enum segment_cache_field { @@ -530,6 +596,7 @@ static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static unsigned long shadow_read_write_fields[] = { + TPR_THRESHOLD, GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -563,6 +630,7 @@ static int max_shadow_read_write_fields = static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), FIELD(GUEST_SS_SELECTOR, guest_ss_selector), @@ -571,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_GS_SELECTOR, guest_gs_selector), FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -587,7 +656,13 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(TSC_OFFSET, tsc_offset), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), @@ -692,12 +767,15 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); static inline short vmcs_field_to_offset(unsigned long field) { - if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) - return -1; + BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || + vmcs_field_to_offset_table[field] == 0) + return -ENOENT; + return vmcs_field_to_offset_table[field]; } @@ -730,6 +808,8 @@ static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); +static bool vmx_xsaves_supported(void); +static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -740,7 +820,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_mpx_supported(void); +static int alloc_identity_pagetable(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -757,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -820,7 +901,6 @@ static const u32 vmx_msr_index[] = { #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) static inline bool is_page_fault(u32 intr_info) { @@ -924,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; @@ -1038,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) SECONDARY_EXEC_SHADOW_VMCS; } +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1071,6 +1146,32 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); } +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && + vmx_xsaves_supported(); +} + +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1600,7 +1701,7 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. */ - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *descs; descs = (void *)gdt->address; @@ -1632,12 +1733,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); - /* On ept, can't emulate nx, and must switch nx atomically */ - if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + + /* + * On EPT, we can't emulate NX, so we must switch EFER atomically. + * On CPUs that support "load IA32_EFER", always switch EFER + * atomically, since it's faster than switching it manually. + */ + if (cpu_has_load_ia32_efer || + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer); return false; } @@ -1646,7 +1755,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -1776,7 +1885,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) */ if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) stts(); - load_gdt(&__get_cpu_var(host_gdt)); + load_gdt(this_cpu_ptr(&host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1806,7 +1915,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -1940,7 +2049,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; @@ -1950,7 +2059,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -2059,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { unsigned long *msr_bitmap; - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_guest_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_nested; + else if (irqchip_in_kernel(vcpu->kvm) && + apic_x2apic_mode(vcpu->arch.apic)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -2134,7 +2246,7 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { u64 tsc_offset; @@ -2235,17 +2347,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). - * TODO: allow these variables to be modified (downgraded) by module options - * or other means. */ -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; -static u32 nested_vmx_misc_low, nested_vmx_misc_high; -static u32 nested_vmx_ept_caps; -static __init void nested_vmx_setup_ctls_msrs(void) +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -2264,58 +2367,74 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - /* - * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is - * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. - */ - nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); + vmx->nested.nested_vmx_pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_POSTED_INTR; - /* - * Exit controls - * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and - * 17 must be 1. - */ + /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); - nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); + vmx->nested.nested_vmx_exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_exit_ctls_high &= + vmx->nested.nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (vmx_mpx_supported()) - nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + + /* We support free control of debug control saving. */ + vmx->nested.nested_vmx_true_exit_ctls_low = + vmx->nested.nested_vmx_exit_ctls_low & + ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ - nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_entry_ctls_high &= + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); + vmx->nested.nested_vmx_entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | - VM_ENTRY_LOAD_IA32_EFER); + vmx->nested.nested_vmx_entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (vmx_mpx_supported()) - nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + + /* We support free control of debug control loading. */ + vmx->nested.nested_vmx_true_entry_ctls_low = + vmx->nested.nested_vmx_entry_ctls_low & + ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = 0; - nested_vmx_procbased_ctls_high &= + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); + vmx->nested.nested_vmx_procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2327,7 +2446,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | + CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2335,39 +2454,59 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; + vmx->nested.nested_vmx_procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; + + /* We support free control of CR3 access interception. */ + vmx->nested.nested_vmx_true_procbased_ctls_low = + vmx->nested.nested_vmx_procbased_ctls_low & + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); - nested_vmx_secondary_ctls_low = 0; - nested_vmx_secondary_ctls_high &= + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); + vmx->nested.nested_vmx_secondary_ctls_low = 0; + vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; - nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT; + vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; - nested_vmx_ept_caps &= vmx_capability.ept; + vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific * for single context invalidation. Hence, only advertise * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else - nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_ept_caps = 0; + + if (enable_unrestricted_guest) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + rdmsr(MSR_IA32_VMX_MISC, + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + vmx->nested.nested_vmx_misc_low |= + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - nested_vmx_misc_high = 0; + vmx->nested.nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2386,6 +2525,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high) /* Returns 0 on success, non-0 otherwise. */ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + switch (msr_index) { case MSR_IA32_VMX_BASIC: /* @@ -2394,33 +2535,50 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - *pdata = VMCS12_REVISION | + *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, - nested_vmx_pinbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); + break; case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); + break; case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); + break; case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr(nested_vmx_misc_low, - nested_vmx_misc_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -2442,15 +2600,16 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = -1ULL; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x1f; + *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, - nested_vmx_secondary_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps; break; default: return 1; @@ -2515,6 +2674,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_get_vmx_msr(vcpu, msr_index, pdata); + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + data = vcpu->arch.ia32_xss; + break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; @@ -2584,6 +2748,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; @@ -2604,6 +2770,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return 1; /* they are read-only */ + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + /* + * The only supported bit as of Skylake is bit 8, but + * it is not supported on KVM. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + if (vcpu->arch.ia32_xss != host_xss) + add_atomic_switch_msr(vmx, MSR_IA32_XSS, + vcpu->arch.ia32_xss, host_xss); + else + clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2614,12 +2796,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: msr = find_msr_entry(vmx, msr_index); if (msr) { + u64 old_msr_data = msr->data; msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); preempt_enable(); + if (ret) + msr->data = old_msr_data; } break; } @@ -2687,13 +2872,13 @@ static void kvm_cpu_vmxon(u64 addr) : "memory", "cc"); } -static int hardware_enable(void *garbage) +static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old, test_bits; - if (read_cr4() & X86_CR4_VMXE) + if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); @@ -2720,14 +2905,14 @@ static int hardware_enable(void *garbage) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + cr4_set_bits(X86_CR4_VMXE); if (vmm_exclusive) { kvm_cpu_vmxon(phys_addr); ept_sync_global(); } - native_store_gdt(&__get_cpu_var(host_gdt)); + native_store_gdt(this_cpu_ptr(&host_gdt)); return 0; } @@ -2751,13 +2936,13 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); } -static void hardware_disable(void *garbage) +static void hardware_disable(void) { if (vmm_exclusive) { vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -2836,7 +3021,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS; + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2959,6 +3146,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } + if (cpu_has_xsaves) + rdmsrl(MSR_IA32_XSS, host_xss); + return 0; } @@ -3062,68 +3252,6 @@ static __init int alloc_kvm_area(void) return 0; } -static __init int hardware_setup(void) -{ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (!cpu_has_vmx_vpid()) - enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { - enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } - - if (!cpu_has_vmx_ept_ad_bits()) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest()) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) - flexpriority_enabled = 0; - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) - ple_gap = 0; - - if (!cpu_has_vmx_apicv()) - enable_apicv = 0; - - if (enable_apicv) - kvm_x86_ops->update_cr8_intercept = NULL; - else { - kvm_x86_ops->hwapic_irr_update = NULL; - kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; - } - - if (nested) - nested_vmx_setup_ctls_msrs(); - - return alloc_kvm_area(); -} - -static __exit void hardware_unsetup(void) -{ - free_kvm_area(); -} - static bool emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !guest_state_valid(vcpu); @@ -3141,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, * default value. */ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - save->selector &= ~SELECTOR_RPL_MASK; - save->dpl = save->selector & SELECTOR_RPL_MASK; + save->selector &= ~SEGMENT_RPL_MASK; + save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } vmx_set_segment(vcpu, save, seg); @@ -3653,7 +3781,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); out: - vmx->emulation_required |= emulation_required(vcpu); + vmx->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3715,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) unsigned int cs_rpl; vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - cs_rpl = cs.selector & SELECTOR_RPL_MASK; + cs_rpl = cs.selector & SEGMENT_RPL_MASK; if (cs.unusable) return false; @@ -3743,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) unsigned int ss_rpl; vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - ss_rpl = ss.selector & SELECTOR_RPL_MASK; + ss_rpl = ss.selector & SEGMENT_RPL_MASK; if (ss.unusable) return true; @@ -3765,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) unsigned int rpl; vmx_get_segment(vcpu, &var, seg); - rpl = var.selector & SELECTOR_RPL_MASK; + rpl = var.selector & SEGMENT_RPL_MASK; if (var.unusable) return true; @@ -3792,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu) if (tr.unusable) return false; - if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; @@ -3810,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) if (ldtr.unusable) return true; - if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) return false; @@ -3827,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - return ((cs.selector & SELECTOR_RPL_MASK) == - (ss.selector & SELECTOR_RPL_MASK)); + return ((cs.selector & SEGMENT_RPL_MASK) == + (ss.selector & SEGMENT_RPL_MASK)); } /* @@ -3888,7 +4016,7 @@ static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; u16 data = 0; - int r, idx, ret = 0; + int idx, r; idx = srcu_read_lock(&kvm->srcu); fn = kvm->arch.tss_addr >> PAGE_SHIFT; @@ -3910,32 +4038,32 @@ static int init_rmode_tss(struct kvm *kvm) r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, sizeof(u8)); - if (r < 0) - goto out; - - ret = 1; out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + return r; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret; + int i, idx, r = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) - return 1; - if (unlikely(!kvm->arch.ept_identity_pagetable)) { - printk(KERN_ERR "EPT: identity-mapping pagetable " - "haven't been allocated!\n"); return 0; - } + + /* Protect kvm->arch.ept_identity_pagetable_done. */ + mutex_lock(&kvm->slots_lock); + if (likely(kvm->arch.ept_identity_pagetable_done)) - return 1; - ret = 0; + goto out2; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + + r = alloc_identity_pagetable(kvm); + if (r < 0) + goto out2; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) @@ -3950,10 +4078,13 @@ static int init_rmode_identity_map(struct kvm *kvm) goto out; } kvm->arch.ept_identity_pagetable_done = true; - ret = 1; + out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + +out2: + mutex_unlock(&kvm->slots_lock); + return r; } static void seg_setup(int seg) @@ -3978,23 +4109,28 @@ static int alloc_apic_access_page(struct kvm *kvm) int r = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page) + if (kvm->arch.apic_access_page_done) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; } - kvm->arch.apic_access_page = page; + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -4002,31 +4138,20 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { - struct page *page; + /* Called with kvm->slots_lock held. */ + struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(&kvm->slots_lock); - if (kvm->arch.ept_identity_pagetable) - goto out; + BUG_ON(kvm->arch.ept_identity_pagetable_done); + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); - if (r) - goto out; - page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - - kvm->arch.ept_identity_pagetable = page; -out: - mutex_unlock(&kvm->slots_lock); return r; } @@ -4128,6 +4253,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, } } +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) { + WARN_ON(1); + return; + } + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) @@ -4166,6 +4337,74 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (vmx->nested.pi_desc && + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return 0; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) + return 0; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { + WARN_ON(1); + return -ENOMEM; + } + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + return 0; +} + +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + return true; + } +#endif + return false; +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* the PIR and ON have been set by L1. */ + kvm_vcpu_trigger_posted_interrupt(vcpu); + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + return -1; +} /* * Send interrupt to vcpu via posted interrupt way. * 1. If target vcpu is running(non-root mode), send posted interrupt @@ -4178,17 +4417,16 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) struct vcpu_vmx *vmx = to_vmx(vcpu); int r; + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return; + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; r = pi_test_and_set_on(&vmx->pi_desc); kvm_make_request(KVM_REQ_EVENT, vcpu); -#ifdef CONFIG_SMP - if (!r && (vcpu->mode == IN_GUEST_MODE)) - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); - else -#endif + if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) kvm_vcpu_kick(vcpu); } @@ -4218,11 +4456,16 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; + unsigned long cr4; vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + /* Save the most likely value for this task's CR4 in the VMCS. */ + cr4 = cr4_read_shadow(); + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmx->host_state.vmcs_host_cr4 = cr4; + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* @@ -4324,6 +4567,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* PML is enabled/disabled in creating/destorying vcpu */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + return exec_control; } @@ -4338,6 +4584,7 @@ static void ept_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } +#define VMX_XSS_EXIT_BITMAP 0 /* * Sets up the vmcs for emulated real mode. */ @@ -4385,7 +4632,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); @@ -4422,7 +4670,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.pat = host_pat; } - for (i = 0; i < NR_VMX_MSR; ++i) { + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; int j = vmx->nmsrs; @@ -4446,6 +4694,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); + if (vmx_xsaves_supported()) + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + return 0; } @@ -4460,8 +4711,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_bsp(&vmx->vcpu)) + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); @@ -4520,9 +4771,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, 0); } - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); if (vmx_vm_has_apicv(vcpu->kvm)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); @@ -4712,10 +4961,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; - if (!init_rmode_tss(kvm)) - return -ENOMEM; - - return 0; + return init_rmode_tss(kvm); } static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) @@ -4761,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); + return kvm_vcpu_halt(vcpu); } return 1; } @@ -4826,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_invalid_opcode(intr_info)) { + if (is_guest_mode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); @@ -4845,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; + vcpu->run->internal.data[2] = error_code; return 0; } @@ -4873,7 +5124,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ skip_emulated_instruction(vcpu); @@ -4950,11 +5201,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long always_on = VMXON_CR0_ALWAYSON; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) always_on &= ~(X86_CR0_PE | X86_CR0_PG); @@ -4979,7 +5231,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vmcs12, val)) + if (!nested_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5039,7 +5291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - val = kvm_register_read(vcpu, reg); + val = kvm_register_readl(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -5056,7 +5308,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = kvm_register_read(vcpu, reg); + u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); if (irqchip_in_kernel(vcpu->kvm)) @@ -5109,13 +5361,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int dr, reg; + int dr, dr7, reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + + /* First, if DR does not exist, trigger UD */ + if (!kvm_require_dr(vcpu, dr)) + return 1; /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; - dr = vmcs_readl(GUEST_DR7); - if (dr & DR7_GD) { + dr7 = vmcs_readl(GUEST_DR7); + if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the @@ -5123,17 +5382,14 @@ static int handle_dr(struct kvm_vcpu *vcpu) */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr; - vcpu->run->debug.arch.pc = - vmcs_readl(GUEST_CS_BASE) + - vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.dr7 = dr7; + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr7 &= ~DR7_GD; - vcpu->arch.dr6 |= DR6_BD; - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BD | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -5155,8 +5411,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; } - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; @@ -5165,7 +5419,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; kvm_register_write(vcpu, reg, val); } else - if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; skip_emulated_instruction(vcpu); @@ -5240,7 +5494,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) msr.data = data; msr.index = ecx; msr.host_initiated = false; - if (vmx_set_msr(vcpu, &msr) != 0) { + if (kvm_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -5285,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) static int handle_halt(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); return kvm_emulate_halt(vcpu); } static int handle_vmcall(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); kvm_emulate_hypercall(vcpu); return 1; } @@ -5322,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); kvm_emulate_wbinvd(vcpu); return 1; } @@ -5337,6 +5588,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsaves(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + +static int handle_xrstors(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5438,7 +5703,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); /* * TODO: What about debug traps on tss switch? @@ -5485,11 +5750,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ - error_code = exit_qualification & (1U << 1); + error_code = exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ - error_code |= (exit_qualification & (1U << 2)) << 2; + error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & 0x1; + error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; vcpu->arch.exit_qualification = exit_qualification; @@ -5504,17 +5769,18 @@ static u64 ept_rsvd_mask(u64 spte, int level) for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) mask |= (1ULL << i); - if (level > 2) + if (level == 4) /* bits 7:3 reserved */ mask |= 0xf8; - else if (level == 2) { - if (spte & (1ULL << 7)) - /* 2MB ref, bits 20:12 reserved */ - mask |= 0x1ff000; - else - /* bits 6:3 reserved */ - mask |= 0x78; - } + else if (spte & (1ULL << 7)) + /* + * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, + * level == 1 if the hypervisor is using the ignored bit 7. + */ + mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; + else if (level > 1) + /* bits 6:3 reserved */ + mask |= 0x78; return mask; } @@ -5544,7 +5810,8 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, WARN_ON(1); } - if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + /* bits 5:3 are _not_ reserved for large page or leaf page */ + if ((rsvd_bits & 0x38) == 0) { u64 ept_mem_type = (spte & 0x38) >> 3; if (ept_mem_type == 2 || ept_mem_type == 3 || @@ -5564,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); return 1; } @@ -5621,7 +5888,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu) && count-- != 0) { + while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5645,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - ret = kvm_emulate_halt(vcpu); + ret = kvm_vcpu_halt(vcpu); goto out; } @@ -5655,17 +5922,314 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = emulation_required(vcpu); out: return ret; } +static int __grow_ple_window(int val) +{ + if (ple_window_grow < 1) + return ple_window; + + val = min(val, ple_window_actual_max); + + if (ple_window_grow < ple_window) + val *= ple_window_grow; + else + val += ple_window_grow; + + return val; +} + +static int __shrink_ple_window(int val, int modifier, int minimum) +{ + if (modifier < 1) + return ple_window; + + if (modifier < ple_window) + val /= modifier; + else + val -= modifier; + + return max(val, minimum); +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, + ple_window_shrink, ple_window); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); +} + +/* + * ple_window_actual_max is computed to be one grow_ple_window() below + * ple_window_max. (See __grow_ple_window for the reason.) + * This prevents overflows, because ple_window_max is int. + * ple_window_max effectively rounded down to a multiple of ple_window_grow in + * this process. + * ple_window_max is also prevented from setting vmx->ple_window < ple_window. + */ +static void update_ple_window_actual_max(void) +{ + ple_window_actual_max = + __shrink_ple_window(max(ple_window_max, ple_window), + ple_window_grow, INT_MIN); +} + +static __init int hardware_setup(void) +{ + int r = -ENOMEM, i, msr; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_a) + return r; + + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_b) + goto out; + + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) + goto out1; + + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) + goto out2; + + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) + goto out3; + + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; + + if (nested) { + vmx_msr_bitmap_nested = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_nested) + goto out5; + } + + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out6; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) + goto out7; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + if (nested) + memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); + + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; + goto out8; + } + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { + enable_ept = 0; + enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; + } + + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) + kvm_x86_ops->set_apic_access_page_addr = NULL; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + + if (!cpu_has_vmx_ple()) + ple_gap = 0; + + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; + + if (enable_apicv) + kvm_x86_ops->update_cr8_intercept = NULL; + else { + kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->hwapic_isr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } + + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } + + if (enable_ept) { + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + update_ple_window_actual_max(); + + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + + return alloc_kvm_area(); + +out8: + free_page((unsigned long)vmx_vmwrite_bitmap); +out7: + free_page((unsigned long)vmx_vmread_bitmap); +out6: + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +out4: + free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +out2: + free_page((unsigned long)vmx_msr_bitmap_legacy); +out1: + free_page((unsigned long)vmx_io_bitmap_b); +out: + free_page((unsigned long)vmx_io_bitmap_a); + + return r; +} + +static __exit void hardware_unsetup(void) +{ + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); + + free_kvm_area(); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ static int handle_pause(struct kvm_vcpu *vcpu) { + if (ple_gap) + grow_ple_window(vcpu); + skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); @@ -5754,22 +6318,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) /* * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one - * currently used, if running L2), and vmcs01 when running L2. + * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs + * must be &vmx->vmcs01. */ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) { struct vmcs02_list *item, *n; + + WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - if (vmx->loaded_vmcs != &item->vmcs02) - free_loaded_vmcs(&item->vmcs02); + /* + * Something will leak if the above WARN triggers. Better than + * a use-after-free. + */ + if (vmx->loaded_vmcs == &item->vmcs02) + continue; + + free_loaded_vmcs(&item->vmcs02); list_del(&item->list); kfree(item); + vmx->nested.vmcs02_num--; } - vmx->nested.vmcs02_num = 0; - - if (vmx->loaded_vmcs != &vmx->vmcs01) - free_loaded_vmcs(&vmx->vmcs01); } /* @@ -5814,6 +6383,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); +} + static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = @@ -5918,7 +6494,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, * which replaces physical address width with 32 * */ - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -5936,7 +6512,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -5951,7 +6527,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } break; case EXIT_REASON_VMPTRLD: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -6086,20 +6662,28 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { u32 exec_control; + if (vmx->nested.current_vmptr == -1ull) + return; + + /* current_vmptr and current_vmcs12 are always set/reset together */ + if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) + return; + if (enable_shadow_vmcs) { - if (vmx->nested.current_vmcs12 != NULL) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - vmcs_write64(VMCS_LINK_POINTER, -1ull); - } + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); } + vmx->nested.posted_intr_nv = -1; kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; } /* @@ -6110,18 +6694,25 @@ static void free_nested(struct vcpu_vmx *vmx) { if (!vmx->nested.vmxon) return; + vmx->nested.vmxon = false; - if (vmx->nested.current_vmptr != -1ull) { - nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } + nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; } nested_free_all_saved_vmcss(vmx); @@ -6152,11 +6743,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (vmptr == vmx->nested.current_vmptr) { + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } page = nested_get_page(vcpu, vmptr); if (page == NULL) { @@ -6223,58 +6811,60 @@ static inline int vmcs_field_readonly(unsigned long field) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) +static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); char *p; if (offset < 0) - return 0; + return offset; p = ((char *)(get_vmcs12(vcpu))) + offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_NATURAL_WIDTH: *ret = *((natural_width *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U16: *ret = *((u16 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U32: *ret = *((u32 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U64: *ret = *((u64 *)p); - return 1; + return 0; default: - return 0; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } -static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 field_value){ +static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); char *p = ((char *) get_vmcs12(vcpu)) + offset; if (offset < 0) - return false; + return offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: *(u16 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U32: *(u32 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U64: *(u64 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_NATURAL_WIDTH: *(natural_width *)p = field_value; - return true; + return 0; default: - return false; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } @@ -6288,6 +6878,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + preempt_disable(); + vmcs_load(shadow_vmcs); for (i = 0; i < num_fields; i++) { @@ -6305,12 +6897,17 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: field_value = vmcs_readl(field); break; + default: + WARN_ON(1); + continue; } vmcs12_write_any(&vmx->vcpu, field, field_value); } vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); } static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) @@ -6348,6 +6945,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: vmcs_writel(field, (long)field_value); break; + default: + WARN_ON(1); + break; } } } @@ -6384,9 +6984,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* Decode instruction info and find the field to read */ - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (!vmcs12_read_any(vcpu, field, &field_value)) { + if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6397,7 +6997,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (vmx_instruction_info & (1u << 10)) { - kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, @@ -6434,21 +7034,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_read(vcpu, + field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, - &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { + &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); @@ -6456,7 +7056,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; } - if (!vmcs12_write_any(vcpu, field, field_value)) { + if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6498,9 +7098,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) - nested_release_vmcs12(vmx); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; @@ -6548,6 +7147,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type; gva_t gva; @@ -6556,8 +7156,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6571,9 +7172,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, @@ -6596,7 +7197,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); break; default: @@ -6609,6 +7210,37 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6654,6 +7286,10 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_XSAVES] = handle_xsaves, + [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, }; static const int kvm_vmx_max_exit_handlers = @@ -6685,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, else if (port < 0x10000) bitmap = vmcs12->io_bitmap_b; else - return 1; + return true; bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) - return 1; + return true; if (b & (1 << (port & 7))) - return 1; + return true; port++; size--; last_bitmap = bitmap; } - return 0; + return false; } /* @@ -6715,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return 1; + return true; /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, @@ -6734,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, if (msr_index < 1024*8) { unsigned char b; if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) - return 1; + return true; return 1 & (b >> (msr_index & 7)); } else - return 1; /* let L1 handle the wrong parameter */ + return true; /* let L1 handle the wrong parameter */ } /* @@ -6751,7 +7387,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_read(vcpu, reg); + unsigned long val = kvm_register_readl(vcpu, reg); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ @@ -6759,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, case 0: if (vmcs12->cr0_guest_host_mask & (val ^ vmcs12->cr0_read_shadow)) - return 1; + return true; break; case 3: if ((vmcs12->cr3_target_count >= 1 && @@ -6770,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, vmcs12->cr3_target_value2 == val) || (vmcs12->cr3_target_count >= 4 && vmcs12->cr3_target_value3 == val)) - return 0; + return false; if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) - return 1; + return true; break; case 4: if (vmcs12->cr4_guest_host_mask & (vmcs12->cr4_read_shadow ^ val)) - return 1; + return true; break; case 8: if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) - return 1; + return true; break; } break; case 2: /* clts */ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && (vmcs12->cr0_read_shadow & X86_CR0_TS)) - return 1; + return true; break; case 1: /* mov from cr */ switch (cr) { case 3: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR3_STORE_EXITING) - return 1; + return true; break; case 8: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR8_STORE_EXITING) - return 1; + return true; break; } break; @@ -6811,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, */ if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) - return 1; + return true; if ((vmcs12->cr0_guest_host_mask & 0x1) && !(vmcs12->cr0_read_shadow & 0x1) && (val & 0x1)) - return 1; + return true; break; } - return 0; + return false; } /* @@ -6841,58 +7477,60 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) KVM_ISA_VMX); if (vmx->nested.nested_run_pending) - return 0; + return false; if (unlikely(vmx->fail)) { pr_info_ratelimited("%s failed vm entry %x\n", __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); - return 1; + return true; } switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: if (!is_exception(intr_info)) - return 0; + return false; else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) - return 0; + return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: - return 0; + return false; case EXIT_REASON_TRIPLE_FAULT: - return 1; + return true; case EXIT_REASON_PENDING_INTERRUPT: return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: - return 1; + return true; case EXIT_REASON_CPUID: - return 1; + if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) + return false; + return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); case EXIT_REASON_INVD: - return 1; + return true; case EXIT_REASON_INVLPG: return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); - case EXIT_REASON_RDTSC: + case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! */ - return 1; + return true; case EXIT_REASON_CR_ACCESS: return nested_vmx_exit_handled_cr(vcpu, vmcs12); case EXIT_REASON_DR_ACCESS: @@ -6903,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: - return 1; + return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_INSTRUCTION: @@ -6913,12 +7551,16 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: - return 0; + return false; case EXIT_REASON_TPR_BELOW_THRESHOLD: - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* apic_write and eoi_induced should exit unconditionally. */ + return true; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is @@ -6926,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * missing in the guest EPT table (EPT12), the EPT violation * will be injected with nested_ept_inject_page_fault() */ - return 0; + return false; case EXIT_REASON_EPT_MISCONFIG: /* * L2 never uses directly L1's EPT, but rather L0's own EPT @@ -6934,13 +7576,21 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * (EPT on EPT). So any problems with the structure of the * table is L0's fault. */ - return 0; + return false; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: - return 1; + return true; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); default: - return 1; + return true; } } @@ -6950,6 +7600,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static int vmx_enable_pml(struct vcpu_vmx *vmx) +{ + struct page *pml_pg; + u32 exec_control; + + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!pml_pg) + return -ENOMEM; + + vmx->pml_pg = pml_pg; + + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + + return 0; +} + +static void vmx_disable_pml(struct vcpu_vmx *vmx) +{ + u32 exec_control; + + ASSERT(vmx->pml_pg); + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +} + +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and it's obvious that only + * vcpus running in guest are possible to have unflushed GPAs in PML + * buffer. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -6960,6 +7693,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. + */ + if (enable_pml) + vmx_flush_pml_buffer(vmx); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -7028,14 +7771,20 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } - return 0; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; @@ -7073,14 +7822,34 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently we do not handle the nested case where L2 has an + * APIC access page of its own; that page is still pinned. + * Hence, we skip the case where the VCPU is in guest mode _and_ + * L1 prepared an APIC access page for L2. + * + * For the case where L1 and L2 share the same APIC access page + * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear + * in the vmcs12), this function will only update either the vmcs01 + * or the vmcs02. If the former, the vmcs02 will be updated by + * prepare_vmcs02. If the latter, the vmcs01 will be updated in + * the next L2->L1 exit. + */ + if (!is_guest_mode(vcpu) || + !nested_cpu_has2(vmx->nested.current_vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmcs_write64(APIC_ACCESS_ADDR, hpa); +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; u8 old; - if (!vmx_vm_has_apicv(kvm)) - return; - if (isr == -1) isr = 0; @@ -7098,6 +7867,9 @@ static void vmx_set_rvi(int vector) u16 status; u8 old; + if (vector == -1) + vector = 0; + status = vmcs_read16(GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { @@ -7109,10 +7881,30 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + if (max_irr == -1) return; - vmx_set_rvi(max_irr); + /* + * In guest mode. If a vmexit is needed, vmx_check_nested_events + * handles it. + */ + if (nested_exit_on_intr(vcpu)) + return; + + /* + * Else, fall back to pre-APICv interrupt injection since L2 + * is run without virtual interrupt delivery. + */ + if (!kvm_event_needs_reinjection(vcpu) && + vmx_interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, max_irr, false); + vmx_inject_irq(vcpu); + } } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -7202,6 +7994,12 @@ static bool vmx_mpx_supported(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); } +static bool vmx_xsaves_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7336,7 +8134,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr; + unsigned long debugctlmsr, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -7347,6 +8145,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + if (vmx->nested.sync_shadow_vmcs) { copy_vmcs12_to_shadow(vmx); vmx->nested.sync_shadow_vmcs = false; @@ -7357,6 +8160,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->host_state.vmcs_host_cr4 = cr4; + } + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7520,13 +8329,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } +static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == &vmx->vmcs01) + return; + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_pml) + vmx_disable_pml(vmx); free_vpid(vmx); - free_loaded_vmcs(vmx->loaded_vmcs); + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); free_nested(vmx); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -7548,6 +8377,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) + > PAGE_SIZE); + err = -ENOMEM; if (!vmx->guest_msrs) { goto uninit_vcpu; @@ -7581,16 +8413,30 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; - if (!init_rmode_identity_map(kvm)) + err = init_rmode_identity_map(kvm); + if (err) goto free_vmcs; } + if (nested) + nested_vmx_setup_ctls_msrs(vmx); + + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + err = vmx_enable_pml(vmx); + if (err) + goto free_vmcs; + } + return &vmx->vcpu; free_vmcs: @@ -7679,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) exec_control); } } + if (nested && !vmx->rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; } /* Exposing INVPCID only when PCID is exposed */ @@ -7732,9 +8581,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, - nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.nested_vmx_ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -7747,6 +8597,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -7754,8 +8616,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -7763,6 +8624,82 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || + vmcs12->apic_access_addr >> maxphyaddr) + return false; + + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. + */ + if (vmx->nested.apic_access_page) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || + vmcs12->virtual_apic_page_addr >> maxphyaddr) + return false; + + if (vmx->nested.virtual_apic_page) /* shouldn't happen */ + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = + nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + + /* + * Failing the vm entry is _not_ what the processor does + * but it's basically the only possibility we have. + * We could still enter the guest if CR8 load exits are + * enabled, CR8 store exits are enabled, and virtualize APIC + * access is disabled; in this case the processor would never + * use the TPR shadow and we could simply clear the bit from + * the execution control. But such a configuration is useless, + * so let's keep the code simple. + */ + if (!vmx->nested.virtual_apic_page) + return false; + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || + vmcs12->posted_intr_desc_addr >> maxphyaddr) + return false; + + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + } + vmx->nested.pi_desc_page = + nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); + if (!vmx->nested.pi_desc_page) + return false; + + vmx->nested.pi_desc = + (struct pi_desc *)kmap(vmx->nested.pi_desc_page); + if (!vmx->nested.pi_desc) { + nested_release_page_clean(vmx->nested.pi_desc_page); + return false; + } + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + + return true; +} + static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) { u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; @@ -7785,10 +8722,312 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + u64 addr; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { + WARN_ON(1); + return -EINVAL; + } + maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || + ((addr + PAGE_SIZE) >> maxphyaddr)) + return -EINVAL; + + return 0; +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!page) { + WARN_ON(1); + return false; + } + msr_bitmap = (unsigned long *)kmap(page); + if (!msr_bitmap) { + nested_release_page_clean(page); + WARN_ON(1); + return false; + } + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) + for (msr = 0x800; msr <= 0x8ff; msr++) + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + msr, MSR_TYPE_R); + /* TPR is allowed */ + nested_vmx_disable_intercept_for_msr(msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { + /* EOI and self-IPI are allowed */ + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + } else { + /* + * Enable reading intercept of all the x2apic + * MSRs. We should not rely on vmcs12 to do any + * optimizations here, it may have been modified + * by L1. + */ + for (msr = 0x800; msr <= 0x8ff; msr++) + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + msr, + MSR_TYPE_R); + + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + kunmap(page); + nested_release_page_clean(page); + + return true; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + vmcs12->posted_intr_nv & 0xff00)) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field) +{ + int maxphyaddr; + u64 count, addr; + + if (vmcs12_read_any(vcpu, count_field, &count) || + vmcs12_read_any(vcpu, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_warn_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR)) + return -EINVAL; + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + if (kvm_get_msr(vcpu, e.index, &e.value)) { + pr_warn_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &e.value, sizeof(e.value))) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + return -EINVAL; + } + } + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various @@ -7836,7 +9075,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -7846,19 +9091,35 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | - PIN_BASED_POSTED_INTR); + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + if (nested_cpu_has_posted_intr(vmcs12)) { + /* + * Note that we use L0's vector here and in + * vmx_deliver_nested_posted_interrupt. + */ + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } else + exec_control &= ~PIN_BASED_POSTED_INTR; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -7896,24 +9157,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); - /* * If translation failed, no matter: This feature asks * to exit when accessing the given address, and if it * can never be accessed, this feature won't do @@ -7925,11 +9177,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); - } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vcpu->kvm->arch.apic_access_page)); + kvm_vcpu_reload_apic_access_page(vcpu); + } + + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + vmcs_write64(EOI_EXIT_BITMAP0, + vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, + vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, + vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, + vmcs12->eoi_exit_bitmap3); + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -7958,11 +9223,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } + + if (cpu_has_vmx_msr_bitmap() && + exec_control & CPU_BASED_USE_MSR_BITMAPS) { + nested_vmx_merge_msr_bitmap(vcpu, vmcs12); + /* MSR_BITMAP will be set by following vmx_set_efer. */ + } else + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + /* - * Merging of IO and MSR bitmaps not currently supported. + * Merging of IO bitmap not currently supported. * Rather, exit every time. */ - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; @@ -8078,6 +9356,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) int cpu; struct loaded_vmcs *vmcs02; bool ia32e; + u32 msr_entry_idx; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -8112,39 +9391,41 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { - /*TODO: Also verify bits beyond physical address width are 0*/ + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { - /*TODO: Also verify bits beyond physical address width are 0*/ + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8157,7 +9438,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || + if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -8221,6 +9502,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + cpu = get_cpu(); vmx->loaded_vmcs = vmcs02; vmx_vcpu_put(vcpu); @@ -8230,12 +9514,23 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - vmcs12->launch_state = 1; - prepare_vmcs02(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) - return kvm_emulate_halt(vcpu); + return kvm_vcpu_halt(vcpu); vmx->nested.nested_run_pending = 1; @@ -8360,9 +9655,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; } - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -8398,7 +9694,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); @@ -8473,13 +9768,20 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + } + /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) @@ -8489,6 +9791,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (vmx_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); /* update exit information fields: */ @@ -8658,6 +9962,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* @@ -8670,7 +9981,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* trying to cancel vmlaunch/vmresume is a bug */ @@ -8680,6 +9990,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + + vmx_load_vmcs01(vcpu); + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); @@ -8695,13 +10011,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); vmx_segment_cache_clear(vmx); @@ -8721,10 +10030,26 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; } /* + * We are now running in L2, mmu_notifier will force to reload the + * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. + */ + kvm_vcpu_reload_apic_access_page(vcpu); + + /* * Exiting from L2 to L1, we're now back to L1 which thinks it just * finished a VMLAUNCH or VMRESUME instruction, so we need to set the * success or failure flag accordingly. @@ -8777,6 +10102,37 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (ple_gap) + shrink_ple_window(vcpu); +} + +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -8821,7 +10177,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -8844,6 +10199,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .vm_has_apicv = vmx_vm_has_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, @@ -8880,156 +10236,37 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, + .xsaves_supported = vmx_xsaves_supported, .check_nested_events = vmx_check_nested_events, + + .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, }; static int __init vmx_init(void) { - int r, i, msr; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < NR_VMX_MSR; ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return -ENOMEM; - - r = -ENOMEM; - - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out3; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out5; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out6; - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out7; + return r; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } - - if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - return 0; - -out7: - free_page((unsigned long)vmx_vmwrite_bitmap); -out6: - free_page((unsigned long)vmx_vmread_bitmap); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); -out: - free_page((unsigned long)vmx_io_bitmap_a); - return r; } static void __exit vmx_exit(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); - #ifdef CONFIG_KEXEC - rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif |