diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 431 |
1 files changed, 331 insertions, 100 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bfe11cf124a1..3e556c68351b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -125,14 +125,32 @@ module_param(nested, bool, S_IRUGO); * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_GAP 128 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 +#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 +#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ + INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW + static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +/* Default doubles per-vcpu window every exit. */ +static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, int, S_IRUGO); + +/* Default resets per-vcpu window every exit to ple_window. */ +static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, int, S_IRUGO); + +/* Default is to compute the maximum so we can never overflow. */ +static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, int, S_IRUGO); + extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 @@ -379,6 +397,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + struct page *virtual_apic_page; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -453,6 +472,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -484,6 +504,10 @@ struct vcpu_vmx { /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; }; enum segment_cache_field { @@ -533,6 +557,7 @@ static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static unsigned long shadow_read_write_fields[] = { + TPR_THRESHOLD, GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -743,6 +768,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static int alloc_identity_pagetable(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1601,7 +1627,7 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. */ - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *descs; descs = (void *)gdt->address; @@ -1647,7 +1673,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -1777,7 +1803,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) */ if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) stts(); - load_gdt(&__get_cpu_var(host_gdt)); + load_gdt(this_cpu_ptr(&host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1807,7 +1833,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -2135,7 +2161,7 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { u64 tsc_offset; @@ -2330,7 +2356,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | + CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2601,6 +2627,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; @@ -2631,12 +2659,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: msr = find_msr_entry(vmx, msr_index); if (msr) { + u64 old_msr_data = msr->data; msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); preempt_enable(); + if (ret) + msr->data = old_msr_data; } break; } @@ -2704,7 +2735,7 @@ static void kvm_cpu_vmxon(u64 addr) : "memory", "cc"); } -static int hardware_enable(void *garbage) +static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2744,7 +2775,7 @@ static int hardware_enable(void *garbage) ept_sync_global(); } - native_store_gdt(&__get_cpu_var(host_gdt)); + native_store_gdt(this_cpu_ptr(&host_gdt)); return 0; } @@ -2768,7 +2799,7 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); } -static void hardware_disable(void *garbage) +static void hardware_disable(void) { if (vmm_exclusive) { vmclear_local_loaded_vmcss(); @@ -3107,9 +3138,17 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) + if (!cpu_has_vmx_flexpriority()) { flexpriority_enabled = 0; + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if the + * processor does not have the APIC_ACCESS_ADDR VMCS field. + */ + kvm_x86_ops->set_apic_access_page_addr = NULL; + } + if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; @@ -3905,7 +3944,7 @@ static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; u16 data = 0; - int r, idx, ret = 0; + int idx, r; idx = srcu_read_lock(&kvm->srcu); fn = kvm->arch.tss_addr >> PAGE_SHIFT; @@ -3927,32 +3966,32 @@ static int init_rmode_tss(struct kvm *kvm) r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, sizeof(u8)); - if (r < 0) - goto out; - - ret = 1; out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + return r; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret; + int i, idx, r = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) - return 1; - if (unlikely(!kvm->arch.ept_identity_pagetable)) { - printk(KERN_ERR "EPT: identity-mapping pagetable " - "haven't been allocated!\n"); return 0; - } + + /* Protect kvm->arch.ept_identity_pagetable_done. */ + mutex_lock(&kvm->slots_lock); + if (likely(kvm->arch.ept_identity_pagetable_done)) - return 1; - ret = 0; + goto out2; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + + r = alloc_identity_pagetable(kvm); + if (r < 0) + goto out2; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) @@ -3967,10 +4006,13 @@ static int init_rmode_identity_map(struct kvm *kvm) goto out; } kvm->arch.ept_identity_pagetable_done = true; - ret = 1; + out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + +out2: + mutex_unlock(&kvm->slots_lock); + return r; } static void seg_setup(int seg) @@ -3995,23 +4037,28 @@ static int alloc_apic_access_page(struct kvm *kvm) int r = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page) + if (kvm->arch.apic_access_page_done) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; } - kvm->arch.apic_access_page = page; + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -4019,31 +4066,20 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { - struct page *page; + /* Called with kvm->slots_lock held. */ + struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(&kvm->slots_lock); - if (kvm->arch.ept_identity_pagetable) - goto out; + BUG_ON(kvm->arch.ept_identity_pagetable_done); + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); - if (r) - goto out; - - page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - kvm->arch.ept_identity_pagetable = page; -out: - mutex_unlock(&kvm->slots_lock); return r; } @@ -4235,11 +4271,16 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; + unsigned long cr4; vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + /* Save the most likely value for this task's CR4 in the VMCS. */ + cr4 = read_cr4(); + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmx->host_state.vmcs_host_cr4 = cr4; + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* @@ -4402,7 +4443,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); @@ -4477,7 +4519,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; @@ -4537,9 +4579,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, 0); } - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); if (vmx_vm_has_apicv(vcpu->kvm)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); @@ -4729,10 +4769,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; - if (!init_rmode_tss(kvm)) - return -ENOMEM; - - return 0; + return init_rmode_tss(kvm); } static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) @@ -5257,7 +5294,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) msr.data = data; msr.index = ecx; msr.host_initiated = false; - if (vmx_set_msr(vcpu, &msr) != 0) { + if (kvm_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -5521,17 +5558,18 @@ static u64 ept_rsvd_mask(u64 spte, int level) for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) mask |= (1ULL << i); - if (level > 2) + if (level == 4) /* bits 7:3 reserved */ mask |= 0xf8; - else if (level == 2) { - if (spte & (1ULL << 7)) - /* 2MB ref, bits 20:12 reserved */ - mask |= 0x1ff000; - else - /* bits 6:3 reserved */ - mask |= 0x78; - } + else if (spte & (1ULL << 7)) + /* + * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, + * level == 1 if the hypervisor is using the ignored bit 7. + */ + mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; + else if (level > 1) + /* bits 6:3 reserved */ + mask |= 0x78; return mask; } @@ -5561,7 +5599,8 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, WARN_ON(1); } - if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + /* bits 5:3 are _not_ reserved for large page or leaf page */ + if ((rsvd_bits & 0x38) == 0) { u64 ept_mem_type = (spte & 0x38) >> 3; if (ept_mem_type == 2 || ept_mem_type == 3 || @@ -5676,12 +5715,85 @@ out: return ret; } +static int __grow_ple_window(int val) +{ + if (ple_window_grow < 1) + return ple_window; + + val = min(val, ple_window_actual_max); + + if (ple_window_grow < ple_window) + val *= ple_window_grow; + else + val += ple_window_grow; + + return val; +} + +static int __shrink_ple_window(int val, int modifier, int minimum) +{ + if (modifier < 1) + return ple_window; + + if (modifier < ple_window) + val /= modifier; + else + val -= modifier; + + return max(val, minimum); +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, + ple_window_shrink, ple_window); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); +} + +/* + * ple_window_actual_max is computed to be one grow_ple_window() below + * ple_window_max. (See __grow_ple_window for the reason.) + * This prevents overflows, because ple_window_max is int. + * ple_window_max effectively rounded down to a multiple of ple_window_grow in + * this process. + * ple_window_max is also prevented from setting vmx->ple_window < ple_window. + */ +static void update_ple_window_actual_max(void) +{ + ple_window_actual_max = + __shrink_ple_window(max(ple_window_max, ple_window), + ple_window_grow, INT_MIN); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ static int handle_pause(struct kvm_vcpu *vcpu) { + if (ple_gap) + grow_ple_window(vcpu); + skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); @@ -6146,7 +6258,11 @@ static void free_nested(struct vcpu_vmx *vmx) /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; } nested_free_all_saved_vmcss(vmx); @@ -6310,6 +6426,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + preempt_disable(); + vmcs_load(shadow_vmcs); for (i = 0; i < num_fields; i++) { @@ -6333,6 +6451,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); } static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) @@ -6617,7 +6737,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); break; default: @@ -6630,6 +6750,12 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6675,6 +6801,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, }; static const int kvm_vmx_max_exit_handlers = @@ -6892,6 +7019,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return 1; case EXIT_REASON_CPUID: + if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) + return 0; return 1; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -6908,7 +7037,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6936,7 +7065,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_MCE_DURING_VMENTRY: return 0; case EXIT_REASON_TPR_BELOW_THRESHOLD: - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); @@ -7049,14 +7178,20 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } - return 0; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; @@ -7094,6 +7229,29 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently we do not handle the nested case where L2 has an + * APIC access page of its own; that page is still pinned. + * Hence, we skip the case where the VCPU is in guest mode _and_ + * L1 prepared an APIC access page for L2. + * + * For the case where L1 and L2 share the same APIC access page + * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear + * in the vmcs12), this function will only update either the vmcs01 + * or the vmcs02. If the former, the vmcs02 will be updated by + * prepare_vmcs02. If the latter, the vmcs01 will be updated in + * the next L2->L1 exit. + */ + if (!is_guest_mode(vcpu) || + !nested_cpu_has2(vmx->nested.current_vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmcs_write64(APIC_ACCESS_ADDR, hpa); +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -7376,7 +7534,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr; + unsigned long debugctlmsr, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -7387,6 +7545,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + if (vmx->nested.sync_shadow_vmcs) { copy_vmcs12_to_shadow(vmx); vmx->nested.sync_shadow_vmcs = false; @@ -7397,6 +7560,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr4 = read_cr4(); + if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->host_state.vmcs_host_cr4 = cr4; + } + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7642,10 +7811,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; - if (!init_rmode_identity_map(kvm)) + err = init_rmode_identity_map(kvm); + if (err) goto free_vmcs; } @@ -7824,6 +7991,55 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* TODO: Also verify bits beyond physical address width are 0 */ + if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) + return false; + + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. + */ + if (vmx->nested.apic_access_page) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + /* TODO: Also verify bits beyond physical address width are 0 */ + if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) + return false; + + if (vmx->nested.virtual_apic_page) /* shouldn't happen */ + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = + nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + + /* + * Failing the vm entry is _not_ what the processor does + * but it's basically the only possibility we have. + * We could still enter the guest if CR8 load exits are + * enabled, CR8 store exits are enabled, and virtualize APIC + * access is disabled; in this case the processor would never + * use the TPR shadow and we could simply clear the bit from + * the execution control. But such a configuration is useless, + * so let's keep the code simple. + */ + if (!vmx->nested.virtual_apic_page) + return false; + } + + return true; +} + static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) { u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; @@ -7849,7 +8065,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various @@ -7970,16 +8186,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); - /* * If translation failed, no matter: This feature asks * to exit when accessing the given address, and if it * can never be accessed, this feature won't do @@ -7994,8 +8200,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vcpu->kvm->arch.apic_access_page)); + kvm_vcpu_reload_apic_access_page(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -8024,6 +8229,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } + /* * Merging of IO and MSR bitmaps not currently supported. * Rather, exit every time. @@ -8185,8 +8397,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !PAGE_ALIGNED(vmcs12->apic_access_addr)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8790,10 +9001,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; } /* + * We are now running in L2, mmu_notifier will force to reload the + * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. + */ + kvm_vcpu_reload_apic_access_page(vcpu); + + /* * Exiting from L2 to L1, we're now back to L1 which thinks it just * finished a VMLAUNCH or VMRESUME instruction, so we need to set the * success or failure flag accordingly. @@ -8846,6 +9067,12 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (ple_gap) + shrink_ple_window(vcpu); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -8890,7 +9117,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -8913,6 +9139,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .vm_has_apicv = vmx_vm_has_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, @@ -8951,6 +9178,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .mpx_supported = vmx_mpx_supported, .check_nested_events = vmx_check_nested_events, + + .sched_in = vmx_sched_in, }; static int __init vmx_init(void) @@ -9065,6 +9294,8 @@ static int __init vmx_init(void) } else kvm_disable_tdp(); + update_ple_window_actual_max(); + return 0; out7: @@ -9098,7 +9329,7 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_vmread_bitmap); #ifdef CONFIG_KEXEC - rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif |