From 7b6f8a06e482960ba6ab06faba51c8f3727a5c7b Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 3 May 2019 03:08:52 -0700 Subject: kvm: x86: Move kvm_set_mmio_spte_mask() from x86.c to mmu.c As a prerequisite to fix several SPTE reserved bits related calculation errors caused by MKTME, which requires kvm_set_mmio_spte_mask() to use local static variable defined in mmu.c. Also move call site of kvm_set_mmio_spte_mask() from kvm_arch_init() to kvm_mmu_module_init() so that kvm_set_mmio_spte_mask() can be static. Reviewed-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 31 ------------------------------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1e9ba81accba..b1d4f55395cd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5998,6 +5998,35 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + int maxphyaddr = boot_cpu_data.x86_phys_bits; + + /* + * Set the reserved bits and the present bit of an paging-structure + * entry to generate page fault with PFER.RSV = 1. + */ + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; + + /* Set the present bit. */ + mask |= 1ull; + + /* + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ + if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) + mask &= ~1ull; + + kvm_mmu_set_mmio_spte_mask(mask, mask); +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; @@ -6014,6 +6043,8 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); + kvm_set_mmio_spte_mask(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 83aefd759846..f3c0f2b63d76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6910,35 +6910,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ - - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask); -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -7035,8 +7006,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_set_mmio_spte_mask(); - kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, -- cgit v1.2.3 From f3ecb59dd49f1742b97df6ba071aaa3d031154ac Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 3 May 2019 03:08:53 -0700 Subject: kvm: x86: Fix reserved bits related calculation errors caused by MKTME Intel MKTME repurposes several high bits of physical address as 'keyID' for memory encryption thus effectively reduces platform's maximum physical address bits. Exactly how many bits are reduced is configured by BIOS. To honor such HW behavior, the repurposed bits are reduced from cpuinfo_x86->x86_phys_bits when MKTME is detected in CPU detection. Similarly, AMD SME/SEV also reduces physical address bits for memory encryption, and cpuinfo->x86_phys_bits is reduced too when SME/SEV is detected, so for both MKTME and SME/SEV, boot_cpu_data.x86_phys_bits doesn't hold physical address bits reported by CPUID anymore. Currently KVM treats bits from boot_cpu_data.x86_phys_bits to 51 as reserved bits, but it's not true anymore for MKTME, since MKTME treats those reduced bits as 'keyID', but not reserved bits. Therefore boot_cpu_data.x86_phys_bits cannot be used to calculate reserved bits anymore, although we can still use it for AMD SME/SEV since SME/SEV treats the reduced bits differently -- they are treated as reserved bits, the same as other reserved bits in page table entity [1]. Fix by introducing a new 'shadow_phys_bits' variable in KVM x86 MMU code to store the effective physical bits w/o reserved bits -- for MKTME, it equals to physical address reported by CPUID, and for SME/SEV, it is boot_cpu_data.x86_phys_bits. Note that for the physical address bits reported to guest should remain unchanged -- KVM should report physical address reported by CPUID to guest, but not boot_cpu_data.x86_phys_bits. Because for Intel MKTME, there's no harm if guest sets up 'keyID' bits in guest page table (since MKTME only works at physical address level), and KVM doesn't even expose MKTME to guest. Arguably, for AMD SME/SEV, guest is aware of SEV thus it should adjust boot_cpu_data.x86_phys_bits when it detects SEV, therefore KVM should still reports physcial address reported by CPUID to guest. Reviewed-by: Sean Christopherson Signed-off-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1d4f55395cd..95ac393e2959 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -262,6 +262,11 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; */ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +static u8 __read_mostly shadow_phys_bits; static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role @@ -471,6 +476,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected + * in CPU detection code, but MKTME treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore for MKTME + * we should still return physical address bits reported by CPUID. + */ + if (!boot_cpu_has(X86_FEATURE_TME) || + WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) + return boot_cpu_data.x86_phys_bits; + + return cpuid_eax(0x80000008) & 0xff; +} + static void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; @@ -484,6 +504,8 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_present_mask = 0; shadow_acc_track_mask = 0; + shadow_phys_bits = kvm_get_shadow_phys_bits(); + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is @@ -4497,7 +4519,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4534,13 +4556,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, false); if (!shadow_me_mask) @@ -4561,7 +4583,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - boot_cpu_data.x86_phys_bits, execonly); + shadow_phys_bits, execonly); } #define BYTE_MASK(access) \ @@ -6001,7 +6023,6 @@ static void mmu_destroy_caches(void) static void kvm_set_mmio_spte_mask(void) { u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; /* * Set the reserved bits and the present bit of an paging-structure @@ -6021,7 +6042,7 @@ static void kvm_set_mmio_spte_mask(void) * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) + if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) mask &= ~1ull; kvm_mmu_set_mmio_spte_mask(mask, mask); -- cgit v1.2.3 From 8f38302c0be2d2daf3b40f7d2142ec77e35d209e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 4 Jun 2019 18:09:39 +0200 Subject: KVM/nSVM: properly map nested VMCB Commit 8c5fbf1a7231 ("KVM/nSVM: Use the new mapping API for mapping guest memory") broke nested SVM completely: kvm_vcpu_map()'s second parameter is GFN so vmcb_gpa needs to be converted with gpa_to_gfn(), not the other way around. Fixes: 8c5fbf1a7231 ("KVM/nSVM: Use the new mapping API for mapping guest memory") Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 735b8c01895e..5beca1030c9a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3293,7 +3293,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info_err, KVM_ISA_SVM); - rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); @@ -3583,7 +3583,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) vmcb_gpa = svm->vmcb->save.rax; - rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); -- cgit v1.2.3 From 84ea3acaa01fb90861b341038998e27a5198e1a0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 May 2019 16:18:05 +0800 Subject: KVM: LAPIC: Extract adaptive tune timer advancement logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract adaptive tune timer advancement logic to a single function. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li [Rename new function. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 57 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4924f83ed4f3..c12b090f4fad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1501,11 +1501,40 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) } } -void wait_lapic_expire(struct kvm_vcpu *vcpu) +static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, + u64 guest_tsc, u64 tsc_deadline) { struct kvm_lapic *apic = vcpu->arch.apic; u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; - u64 guest_tsc, tsc_deadline, ns; + u64 ns; + + /* too early */ + if (guest_tsc < tsc_deadline) { + ns = (tsc_deadline - guest_tsc) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } else { + /* too late */ + ns = (guest_tsc - tsc_deadline) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } + + if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = 0; + apic->lapic_timer.timer_advance_adjust_done = true; + } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; +} + +void wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; if (apic->lapic_timer.expired_tscdeadline == 0) return; @@ -1521,28 +1550,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (!apic->lapic_timer.timer_advance_adjust_done) { - /* too early */ - if (guest_tsc < tsc_deadline) { - ns = (tsc_deadline - guest_tsc) * 1000000ULL; - do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); - } else { - /* too late */ - ns = (guest_tsc - tsc_deadline) * 1000000ULL; - do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); - } - if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; - if (unlikely(timer_advance_ns > 5000)) { - timer_advance_ns = 0; - apic->lapic_timer.timer_advance_adjust_done = true; - } - apic->lapic_timer.timer_advance_ns = timer_advance_ns; - } + if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + adjust_lapic_timer_advance(vcpu, guest_tsc, tsc_deadline); } static void start_sw_tscdeadline(struct kvm_lapic *apic) -- cgit v1.2.3 From ec0671d5684aca3326269439398e47790f1c6e7e Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 May 2019 16:18:08 +0800 Subject: KVM: LAPIC: Delay trace_kvm_wait_lapic_expire tracepoint to after vmexit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wait_lapic_expire() call was moved above guest_enter_irqoff() because of its tracepoint, which violated the RCU extended quiescent state invoked by guest_enter_irqoff()[1][2]. This patch simply moves the tracepoint below guest_exit_irqoff() in vcpu_enter_guest(). Snapshot the delta before VM-Enter, but trace it after VM-Exit. This can help us to move wait_lapic_expire() just before vmentry in the later patch. [1] Commit 8b89fe1f6c43 ("kvm: x86: move tracepoints outside extended quiescent state") [2] https://patchwork.kernel.org/patch/7821111/ Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Suggested-by: Sean Christopherson Signed-off-by: Wanpeng Li [Track whether wait_lapic_expire was called, and do not invoke the tracepoint if not. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 14 +++++++------- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 7 +++++++ 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c12b090f4fad..f8615872ae64 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1502,27 +1502,27 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) } static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, - u64 guest_tsc, u64 tsc_deadline) + s64 advance_expire_delta) { struct kvm_lapic *apic = vcpu->arch.apic; u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 ns; /* too early */ - if (guest_tsc < tsc_deadline) { - ns = (tsc_deadline - guest_tsc) * 1000000ULL; + if (advance_expire_delta < 0) { + ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); timer_advance_ns -= min((u32)ns, timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } else { /* too late */ - ns = (guest_tsc - tsc_deadline) * 1000000ULL; + ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); timer_advance_ns += min((u32)ns, timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } - if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) + if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) apic->lapic_timer.timer_advance_adjust_done = true; if (unlikely(timer_advance_ns > 5000)) { timer_advance_ns = 0; @@ -1545,13 +1545,13 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); + apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) - adjust_lapic_timer_advance(vcpu, guest_tsc, tsc_deadline); + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d6d049ba3045..3e72a255543d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -32,6 +32,7 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; u32 timer_advance_ns; + s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; bool timer_advance_adjust_done; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f3c0f2b63d76..7a26aa5b0861 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7980,6 +7980,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; guest_exit_irqoff(); + if (lapic_in_kernel(vcpu)) { + s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; + if (delta != S64_MIN) { + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); + vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; + } + } local_irq_enable(); preempt_enable(); -- cgit v1.2.3 From b6c4bc659c6f3b7f2b6c3a330ae36f1cfd69d73e Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 May 2019 16:18:09 +0800 Subject: KVM: LAPIC: Optimize timer latency further MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Advance lapic timer tries to hidden the hypervisor overhead between the host emulated timer fires and the guest awares the timer is fired. However, it just hidden the time between apic_timer_fn/handle_preemption_timer -> wait_lapic_expire, instead of the real position of vmentry which is mentioned in the orignial commit d0659d946be0 ("KVM: x86: add option to advance tscdeadline hrtimer expiration"). There is 700+ cpu cycles between the end of wait_lapic_expire and before world switch on my haswell desktop. This patch tries to narrow the last gap(wait_lapic_expire -> world switch), it takes the real overhead time between apic_timer_fn/handle_preemption_timer and before world switch into consideration when adaptively tuning timer advancement. The patch can reduce 40% latency (~1600+ cycles to ~1000+ cycles on a haswell desktop) for kvm-unit-tests/tscdeadline_latency when testing busy waits. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 3 --- 5 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f8615872ae64..fcf42a340790 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1531,7 +1531,7 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, apic->lapic_timer.timer_advance_ns = timer_advance_ns; } -void wait_lapic_expire(struct kvm_vcpu *vcpu) +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; @@ -1553,6 +1553,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } +EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); static void start_sw_tscdeadline(struct kvm_lapic *apic) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 3e72a255543d..f974a3d5a44d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -220,7 +220,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); -void wait_lapic_expire(struct kvm_vcpu *vcpu); +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5beca1030c9a..a7ea34bed3fe 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5643,6 +5643,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xcr0(vcpu); + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + kvm_wait_lapic_expire(vcpu); + /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. Since vmentry is serialising on affected CPUs, there diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b93e36ddee5e..b35b3800a3c0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6445,6 +6445,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_update_hv_timer(vcpu); + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + kvm_wait_lapic_expire(vcpu); + /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. Since vmentry is serialising on affected CPUs, there diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7a26aa5b0861..2a713a74ca2e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7922,9 +7922,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - wait_lapic_expire(vcpu); guest_enter_irqoff(); fpregs_assert_state_consistent(); -- cgit v1.2.3 From 0532dd52dfec0cf774188d3e692d50c197bc4210 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Fri, 3 May 2019 06:38:53 -0700 Subject: kvm: svm/avic: Do not send AVIC doorbell to self AVIC doorbell is used to notify a running vCPU that interrupts has been injected into the vCPU AVIC backing page. Current logic checks only if a VCPU is running before sending a doorbell. However, the doorbell is not necessary if the destination CPU is itself. Add logic to check currently running CPU before sending doorbell. Signed-off-by: Suravee Suthikulpanit Reviewed-by: Alexander Graf Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a7ea34bed3fe..c56f40d430e5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5163,10 +5163,13 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_lapic_set_irr(vec, vcpu->arch.apic); smp_mb__after_atomic(); - if (avic_vcpu_is_running(vcpu)) - wrmsrl(SVM_AVIC_DOORBELL, - kvm_cpu_get_apicid(vcpu->cpu)); - else + if (avic_vcpu_is_running(vcpu)) { + int cpuid = vcpu->cpu; + + if (cpuid != get_cpu()) + wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); + put_cpu(); + } else kvm_vcpu_wake_up(vcpu); } -- cgit v1.2.3 From f257d6dcda0187693407e0c2e5dab69bdab3223f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2019 22:18:17 -0700 Subject: KVM: Directly return result from kvm_arch_check_processor_compat() Add a wrapper to invoke kvm_arch_check_processor_compat() so that the boilerplate ugliness of checking virtualization support on all CPUs is hidden from the arch specific code. x86's implementation in particular is quite heinous, as it unnecessarily propagates the out-param pattern into kvm_x86_ops. While the x86 specific issue could be resolved solely by changing kvm_x86_ops, make the change for all architectures as returning a value directly is prettier and technically more robust, e.g. s390 doesn't set the out param, which could lead to subtle breakage in the (highly unlikely) scenario where the out-param was not pre-initialized by the caller. Opportunistically annotate svm_check_processor_compat() with __init. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/kvm-s390.c | 5 +++++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 4 ++-- virt/kvm/kvm_main.c | 9 ++++++--- 11 files changed, 27 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0369f26ab96d..2cfe839f0b3a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -123,9 +123,9 @@ int kvm_arch_hardware_setup(void) return 0; } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - *(int *)rtn = 0; + return 0; } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index aa3a678711be..628d3c791ad7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -425,9 +425,9 @@ int kvm_arch_hardware_setup(void) return 0; } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - *(int *)rtn = kvmppc_core_check_processor_compat(); + return kvmppc_core_check_processor_compat(); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2b00a3ebee08..da5825a3c16b 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -905,7 +905,6 @@ extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_check_processor_compat(void *rtn) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 28ebd647784c..7936af0a971f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -227,6 +227,11 @@ int kvm_arch_hardware_enable(void) return 0; } +int kvm_arch_check_processor_compat(void) +{ + return 0; +} + static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 450d69a1e6fa..d5457c7bb243 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -999,7 +999,7 @@ struct kvm_x86_ops { int (*disabled_by_bios)(void); /* __init */ int (*hardware_enable)(void); void (*hardware_disable)(void); - void (*check_processor_compatibility)(void *rtn); + int (*check_processor_compatibility)(void);/* __init */ int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c56f40d430e5..302cb409d452 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5871,9 +5871,9 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static void svm_check_processor_compat(void *rtn) +static int __init svm_check_processor_compat(void) { - *(int *)rtn = 0; + return 0; } static bool svm_cpu_has_accelerated_tpr(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b35b3800a3c0..0861c71a4379 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6733,22 +6733,22 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static void __init vmx_check_processor_compat(void *rtn) +static int __init vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; - *(int *)rtn = 0; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - *(int *)rtn = -EIO; + return -EIO; if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, enable_apicv); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); - *(int *)rtn = -EIO; + return -EIO; } + return 0; } static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a713a74ca2e..5cb9ac9b61ab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9082,9 +9082,9 @@ void kvm_arch_hardware_unsetup(void) kvm_x86_ops->hardware_unsetup(); } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - kvm_x86_ops->check_processor_compatibility(rtn); + return kvm_x86_ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79fa4426509c..5e9fd7ad8018 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -870,7 +870,7 @@ int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); -void kvm_arch_check_processor_compat(void *rtn); +int kvm_arch_check_processor_compat(void); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 7eeebe5e9da2..d2389033e9d6 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -105,9 +105,9 @@ int kvm_arch_hardware_setup(void) return 0; } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - *(int *)rtn = 0; + return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ca54b09adf5b..b2579841263f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4224,6 +4224,11 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_arch_vcpu_put(vcpu); } +static void check_processor_compat(void *rtn) +{ + *(int *)rtn = kvm_arch_check_processor_compat(); +} + int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, struct module *module) { @@ -4255,9 +4260,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_free_0a; for_each_online_cpu(cpu) { - smp_call_function_single(cpu, - kvm_arch_check_processor_compat, - &r, 1); + smp_call_function_single(cpu, check_processor_compat, &r, 1); if (r < 0) goto out_free_1; } -- cgit v1.2.3 From 4d22c17c17d228b7f43e51293c7bb7dac87dea40 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 19 Apr 2019 10:16:24 +0800 Subject: kvm: x86: refine kvm_get_arch_capabilities() 1. Using X86_FEATURE_ARCH_CAPABILITIES to enumerate the existence of MSR_IA32_ARCH_CAPABILITIES to avoid using rdmsrl_safe(). 2. Since kvm_get_arch_capabilities() is only used in this file, making it static. Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d5457c7bb243..15e973d9b840 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1532,7 +1532,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5cb9ac9b61ab..8aa6b5a75e7a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1212,11 +1212,12 @@ static u32 msr_based_features[] = { static unsigned int num_msr_based_features; -u64 kvm_get_arch_capabilities(void) +static u64 kvm_get_arch_capabilities(void) { - u64 data; + u64 data = 0; - rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* * If we're doing cache flushes (either "always" or "cond") @@ -1232,7 +1233,6 @@ u64 kvm_get_arch_capabilities(void) return data; } -EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { -- cgit v1.2.3 From b51700632e0e53254733ff706e5bdca22d19dbe5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 May 2019 14:06:53 +0800 Subject: KVM: X86: Provide a capability to disable cstate msr read intercepts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow guest reads CORE cstate when exposing host CPU power management capabilities to the guest. PKG cstate is restricted to avoid a guest to get the whole package information in multi-tenant scenario. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 5 ++++- arch/x86/kvm/x86.h | 5 +++++ include/uapi/linux/kvm.h | 4 +++- tools/include/uapi/linux/kvm.h | 4 +++- 7 files changed, 23 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 33cd92dd6aa5..91fd86fcc49f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4894,6 +4894,7 @@ Valid bits in args[0] are #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15e973d9b840..aeadbc770eb2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -882,6 +882,7 @@ struct kvm_arch { bool mwait_in_guest; bool hlt_in_guest; bool pause_in_guest; + bool cstate_in_guest; unsigned long irq_sources_bitmap; s64 kvmclock_offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0861c71a4379..da24f1858acc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6637,6 +6637,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(kvm)) { + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8aa6b5a75e7a..17e9533f51eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3098,7 +3098,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE; + r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE; if(kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; @@ -4615,6 +4616,8 @@ split_irqchip_unlock: kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a470ff0868c5..275b3b646023 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -333,6 +333,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm) return kvm->arch.pause_in_guest; } +static inline bool kvm_cstate_in_guest(struct kvm *kvm) +{ + return kvm->arch.cstate_in_guest; +} + DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2fe12b40d503..c2152f3dd02d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -696,9 +696,11 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) #define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ KVM_X86_DISABLE_EXITS_HLT | \ - KVM_X86_DISABLE_EXITS_PAUSE) + KVM_X86_DISABLE_EXITS_PAUSE | \ + KVM_X86_DISABLE_EXITS_CSTATE) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6d4ea4b6c922..ef3303f72c46 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -696,9 +696,11 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) +#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) #define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ KVM_X86_DISABLE_EXITS_HLT | \ - KVM_X86_DISABLE_EXITS_PAUSE) + KVM_X86_DISABLE_EXITS_PAUSE | \ + KVM_X86_DISABLE_EXITS_CSTATE) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { -- cgit v1.2.3 From 511a8556e3342af6a46eb9477936b29aa983f154 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 May 2019 14:06:54 +0800 Subject: KVM: X86: Emulate MSR_IA32_MISC_ENABLE MWAIT bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSR IA32_MISC_ENABLE bit 18, according to SDM: | When this bit is set to 0, the MONITOR feature flag is not set (CPUID.01H:ECX[bit 3] = 0). | This indicates that MONITOR/MWAIT are not supported. | | Software attempts to execute MONITOR/MWAIT will cause #UD when this bit is 0. | | When this bit is set to 1 (default), MONITOR/MWAIT are supported (CPUID.01H:ECX[bit 3] = 1). The CPUID.01H:ECX[bit 3] ought to mirror the value of the MSR bit, CPUID.01H:ECX[bit 3] is a better guard than kvm_mwait_in_guest(). kvm_mwait_in_guest() affects the behavior of MONITOR/MWAIT, not its guest visibility. This patch implements toggling of the CPUID bit based on guest writes to the MSR. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Liran Alon Cc: Konrad Rzeszutek Wilk Signed-off-by: Wanpeng Li [Fixes for backwards compatibility - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 9 +++++---- arch/x86/kvm/cpuid.c | 10 ++++++++++ arch/x86/kvm/x86.c | 10 +++++++++- 3 files changed, 24 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7a0e64ccd6ff..f9b021e16ebc 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,10 +378,11 @@ struct kvm_sync_regs { struct kvm_vcpu_events events; }; -#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) -#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) -#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e18a9f9f65b5..60f87ba2ccca 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -137,6 +137,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (best) { + if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) + best->ecx |= F(MWAIT); + else + best->ecx &= ~F(MWAIT); + } + } + /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 17e9533f51eb..10feed6a01eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2547,7 +2547,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && + ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + return 1; + vcpu->arch.ia32_misc_enable_msr = data; + kvm_update_cpuid(vcpu); + } else { + vcpu->arch.ia32_misc_enable_msr = data; + } break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) -- cgit v1.2.3 From 5a253552a5108bcabc2eb4e5b2b86262232f17e7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 27 May 2019 02:45:44 -0600 Subject: x86/kvm/VMX: drop bad asm() clobber from nested_vmx_check_vmentry_hw() While upstream gcc doesn't detect conflicts on cc (yet), it really should, and hence "cc" should not be specified for asm()-s also having "=@cc" outputs. (It is quite pointless anyway to specify a "cc" clobber in x86 inline assembly, since the compiler assumes it to be always clobbered, and has no means [yet] to suppress this behavior.) Signed-off-by: Jan Beulich Fixes: bbc0b8239257 ("KVM: nVMX: Capture VM-Fail via CC_{SET,OUT} in nested early checks") Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1032f068f0b9..0f705c7d590c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2781,7 +2781,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" + : "memory" ); if (vmx->msr_autoload.host.nr) -- cgit v1.2.3 From 1ae4de23eddef465104277e79fa669bec7b3d288 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 2 Jun 2019 21:11:56 +0200 Subject: KVM: VMX: remove unneeded 'asm volatile ("")' from vmcs_write64 __vmcs_writel uses volatile asm, so there is no need to insert another one between the first and the second call to __vmcs_writel in order to prevent unwanted code moves for 32bit targets. Signed-off-by: Uros Bizjak Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/ops.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index b8e50f76fefc..2200fb698dd0 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -146,7 +146,6 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value) __vmcs_writel(field, value); #ifndef CONFIG_X86_64 - asm volatile (""); __vmcs_writel(field+1, value >> 32); #endif } -- cgit v1.2.3 From 0d9ce162cf46c99628cc5da9510b959c7976735b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Thu, 3 Jan 2019 17:14:28 -0800 Subject: kvm: Convert kvm_lock to a mutex It doesn't seem as if there is any particular need for kvm_lock to be a spinlock, so convert the lock to a mutex so that sleepable functions (in particular cond_resched()) can be called while holding it. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 4 +--- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 14 +++++++------- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 30 +++++++++++++++--------------- 6 files changed, 28 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 1bb8bcaf8497..635cd6eaf714 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows: On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. -For spinlocks, kvm_lock is taken outside kvm->mmu_lock. - Everything else is a leaf: no other lock is taken inside the critical sections. @@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above. ------------ Name: kvm_lock -Type: spinlock_t +Type: mutex Arch: any Protects: - vm_list diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7936af0a971f..0fef9192f6ac 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2423,13 +2423,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); if (!kvm->arch.sca) goto out_err; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); sca_offset += 16; if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) sca_offset = 0; kvm->arch.sca = (struct bsca_block *) ((char *) kvm->arch.sca + sca_offset); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); sprintf(debug_name, "kvm-%u", current->pid); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95ac393e2959..3384c539d150 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5956,7 +5956,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -5998,7 +5998,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return freed; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 10feed6a01eb..6200d5a51f13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6719,7 +6719,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm_vcpu *vcpu; int cpu; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); @@ -6745,7 +6745,7 @@ static void kvm_hyperv_tsc_notifier(void) spin_unlock(&ka->pvclock_gtod_sync_lock); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } #endif @@ -6796,17 +6796,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) smp_call_function_single(cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) + if (vcpu->cpu != raw_smp_processor_id()) send_ipi = 1; } } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -6929,12 +6929,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5e9fd7ad8018..abafddb9fe2c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -162,7 +162,7 @@ static inline bool is_error_page(struct page *page) extern struct kmem_cache *kvm_vcpu_cache; -extern spinlock_t kvm_lock; +extern struct mutex kvm_lock; extern struct list_head vm_list; struct kvm_io_range { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b2579841263f..9613987ef4c8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_SPINLOCK(kvm_lock); +DEFINE_MUTEX(kvm_lock); static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); @@ -683,9 +683,9 @@ static struct kvm *kvm_create_vm(unsigned long type) if (r) goto out_err; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); preempt_notifier_inc(); @@ -731,9 +731,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); @@ -4034,13 +4034,13 @@ static int vm_stat_get(void *_offset, u64 *val) u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -4053,12 +4053,12 @@ static int vm_stat_clear(void *_offset, u64 val) if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vm_stat_clear_per_vm((void *)&stat_tmp, 0); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -4073,13 +4073,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) u64 tmp_val; *val = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); *val += tmp_val; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -4092,12 +4092,12 @@ static int vcpu_stat_clear(void *_offset, u64 val) if (val) return -EINVAL; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { stat_tmp.kvm = kvm; vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -4118,7 +4118,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (!kvm_dev.this_device || !kvm) return; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); if (type == KVM_EVENT_CREATE_VM) { kvm_createvm_count++; kvm_active_vms++; @@ -4127,7 +4127,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } created = kvm_createvm_count; active = kvm_active_vms; - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); if (!env) -- cgit v1.2.3 From 1dfdb45ec510ba27e366878f97484e9c9e728902 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Jun 2019 16:46:44 +0200 Subject: KVM: x86: clean up conditions for asynchronous page fault handling Even when asynchronous page fault is disabled, KVM does not want to pause the host if a guest triggers a page fault; instead it will put it into an artificial HLT state that allows running other host processes while allowing interrupt delivery into the guest. However, the way this feature is triggered is a bit confusing. First, it is not used for page faults while a nested guest is running: but this is not an issue since the artificial halt is completely invisible to the guest, either L1 or L2. Second, it is used even if kvm_halt_in_guest() returns true; in this case, the guest probably should not pay the additional latency cost of the artificial halt, and thus we should handle the page fault in a completely synchronous way. By introducing a new function kvm_can_deliver_async_pf, this patch commonizes the code that chooses whether to deliver an async page fault (kvm_arch_async_page_not_present) and the code that chooses whether a page fault should be handled synchronously (kvm_can_do_async_pf). Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 13 ------------- arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3384c539d150..771349e72d2a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4040,19 +4040,6 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) - return false; - - if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6200d5a51f13..279ab4e8dd82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9775,6 +9775,36 @@ static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) sizeof(u32)); } +static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) + return false; + + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || + (vcpu->arch.apf.send_user_only && + kvm_x86_ops->get_cpl(vcpu) == 0)) + return false; + + return true; +} + +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!lapic_in_kernel(vcpu) || + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) + return false; + + if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) + return false; + + /* + * If interrupts are off we cannot even use an artificial + * halt state. + */ + return kvm_x86_ops->interrupt_allowed(vcpu); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -9783,11 +9813,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, trace_kvm_async_pf_not_present(work->arch.token, work->gva); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + if (kvm_can_deliver_async_pf(vcpu) && + !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -9795,6 +9822,16 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.address = work->arch.token; fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); + } else { + /* + * It is not possible to deliver a paravirtualized asynchronous + * page fault, but putting the guest in an artificial halt state + * can be beneficial nevertheless: if an interrupt arrives, we + * can deliver it timely and perhaps the guest will schedule + * another process. When the instruction that triggered a page + * fault is retried, hopefully the page will be ready in the host. + */ + kvm_make_request(KVM_REQ_APF_HALT, vcpu); } } -- cgit v1.2.3 From 1fc5d19472f77fc44f0c5b6852b18416f1db3fea Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Thu, 6 Jun 2019 01:54:47 +0300 Subject: KVM: x86: Use DR_TRAP_BITS instead of hard-coded 15 Make all code consistent with kvm_deliver_exception_payload() by using appropriate symbolic constant instead of hard-coded number. Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d0d5dd44b4f4..199cd2cf7254 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4260,7 +4260,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) ulong dr6; ctxt->ops->get_dr(ctxt, 6, &dr6); - dr6 &= ~15; + dr6 &= ~DR_TRAP_BITS; dr6 |= DR6_BD | DR6_RTM; ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index da24f1858acc..f91323d527be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4521,7 +4521,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) skip_emulated_instruction(vcpu); @@ -4766,7 +4766,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= DR6_BD | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 279ab4e8dd82..eadd987ae350 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6392,7 +6392,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); *r = EMULATE_DONE; -- cgit v1.2.3 From a87f2d3a6eadabad3ce3a8a57c1dd04c14b856ba Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 6 Jun 2019 09:18:45 +0800 Subject: KVM: x86: Add Intel CPUID.1F cpuid emulation support Add support to expose Intel V2 Extended Topology Enumeration Leaf for some new systems with multiple software-visible die within each package. Because unimplemented and unexposed leaves should be explicitly reported as zero, there is no need to limit cpuid.0.eax to the maximum value of feature configuration but limit it to the highest leaf implemented in the current code. A single clamping seems sufficient and cheaper. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Like Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 60f87ba2ccca..9872f86fdbe9 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -436,7 +436,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); + /* Limited to the highest leaf implemented in KVM. */ + entry->eax = min(entry->eax, 0x1fU); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -556,7 +557,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = edx.full; break; } - /* function 0xb has additional index. */ + /* + * Per Intel's SDM, the 0x1f is a superset of 0xb, + * thus they can be handled by common code. + */ + case 0x1f: case 0xb: { int i, level_type; -- cgit v1.2.3 From c1a9acbc5295e278d788e9f7510f543bc9864fa2 Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Thu, 6 Jun 2019 00:17:39 +0300 Subject: kvm: vmx: fix limit checking in get_vmx_mem_address() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel SDM vol. 3, 5.3: The processor causes a general-protection exception (or, if the segment is SS, a stack-fault exception) any time an attempt is made to access the following addresses in a segment: - A byte at an offset greater than the effective limit - A word at an offset greater than the (effective-limit – 1) - A doubleword at an offset greater than the (effective-limit – 3) - A quadword at an offset greater than the (effective-limit – 7) Therefore, the generic limit checking error condition must be exn = (off > limit + 1 - access_len) = (off + access_len - 1 > limit) but not exn = (off + access_len > limit) as for now. Also avoid integer overflow of `off` at 32-bit KVM by casting it to u64. Note: access length is currently sizeof(u64) which is incorrect. This will be fixed in the subsequent patch. Signed-off-by: Eugene Korenevsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0f705c7d590c..018790c1b47f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4115,7 +4115,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, */ if (!(s.base == 0 && s.limit == 0xffffffff && ((s.type & 8) || !(s.type & 4)))) - exn = exn || (off + sizeof(u64) > s.limit); + exn = exn || ((u64)off + sizeof(u64) - 1 > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, -- cgit v1.2.3 From fdb28619a8f033c13f5d9b9e8b5536bb6e68a2c3 Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Thu, 6 Jun 2019 00:19:16 +0300 Subject: kvm: vmx: segment limit check: use access length There is an imperfection in get_vmx_mem_address(): access length is ignored when checking the limit. To fix this, pass access length as a function argument. The access length is usually obvious since it is used by callers after get_vmx_mem_address() call, but for vmread/vmwrite it depends on the state of 64-bit mode. Signed-off-by: Eugene Korenevsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 28 ++++++++++++++++------------ arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c | 3 ++- 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 018790c1b47f..c92349e2f621 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4008,7 +4008,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * #UD or #GP. */ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, gva_t *ret) + u32 vmx_instruction_info, bool wr, int len, gva_t *ret) { gva_t off; bool exn; @@ -4115,7 +4115,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, */ if (!(s.base == 0 && s.limit == 0xffffffff && ((s.type & 8) || !(s.type & 4)))) - exn = exn || ((u64)off + sizeof(u64) - 1 > s.limit); + exn = exn || ((u64)off + len - 1 > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, @@ -4134,7 +4134,8 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) struct x86_exception e; if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) + vmcs_read32(VMX_INSTRUCTION_INFO), false, + sizeof(*vmpointer), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { @@ -4386,6 +4387,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u64 field_value; unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + int len; gva_t gva = 0; struct vmcs12 *vmcs12; @@ -4423,12 +4425,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu) kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), field_value); } else { + len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, &gva)) + vmx_instruction_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, - (is_long_mode(vcpu) ? 8 : 4), NULL); + kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL); } return nested_vmx_succeed(vcpu); @@ -4438,6 +4440,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) static int handle_vmwrite(struct kvm_vcpu *vcpu) { unsigned long field; + int len; gva_t gva; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4463,11 +4466,11 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); else { + len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, len, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, - (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { + if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } @@ -4615,7 +4618,8 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) return 1; - if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, + true, sizeof(gpa_t), &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, @@ -4661,7 +4665,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); @@ -4723,7 +4727,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index e847ff1019a2..29d205bb4e4f 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -21,7 +21,7 @@ void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, gva_t *ret); + u32 vmx_instruction_info, bool wr, int len, gva_t *ret); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f91323d527be..cccf73a91e88 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5345,7 +5345,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) * is read even if it isn't needed (e.g., for type==all) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, + sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { -- cgit v1.2.3 From 2d5ba19bdfef4dd06add144eb04287ee98409f75 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 3 Jun 2019 19:52:44 -0300 Subject: kvm: x86: add host poll control msrs Add an MSRs which allows the guest to disable host polling (specifically the cpuidle-haltpoll, when performing polling in the guest, disables host side polling). Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/msr.txt | 9 +++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 23 +++++++++++++++++++++++ 6 files changed, 39 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index f3f0d57ced8e..df1f4338b3ca 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -273,3 +273,12 @@ MSR_KVM_EOI_EN: 0x4b564d04 guest must both read the least significant bit in the memory area and clear it using a single CPU instruction, such as test and clear, or compare and exchange. + +MSR_KVM_POLL_CONTROL: 0x4b564d05 + Control host-side polling. + + data: Bit 0 enables (1) or disables (0) host-side HLT polling logic. + + KVM guests can request the host not to poll on HLT, for example if + they are performing polling themselves. + diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aeadbc770eb2..a86026969b19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -755,6 +755,8 @@ struct kvm_vcpu_arch { struct gfn_to_hva_cache data; } pv_eoi; + u64 msr_kvm_poll_control; + /* * Indicate whether the access faults on its page table in guest * which is set when fix page fault and used to detect unhandeable diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 19980ec1a316..21d5f0240595 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -29,6 +29,7 @@ #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 #define KVM_FEATURE_PV_SEND_IPI 11 +#define KVM_FEATURE_POLL_CONTROL 12 #define KVM_HINTS_REALTIME 0 @@ -47,6 +48,7 @@ #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 +#define MSR_KVM_POLL_CONTROL 0x4b564d05 struct kvm_steal_time { __u64 steal; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fc042419e670..840e12583b85 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -41,6 +41,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_NO_POLL select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9872f86fdbe9..68c74e948e28 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -658,7 +658,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | - (1 << KVM_FEATURE_PV_SEND_IPI); + (1 << KVM_FEATURE_PV_SEND_IPI) | + (1 << KVM_FEATURE_POLL_CONTROL); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eadd987ae350..eb87d71ec14a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1177,6 +1177,7 @@ static u32 emulated_msrs[] = { MSR_IA32_POWER_CTL, MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, }; static unsigned num_emulated_msrs; @@ -2636,6 +2637,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; + case MSR_KVM_POLL_CONTROL: + /* only enable bit supported */ + if (data & (-1ULL << 1)) + return 1; + + vcpu->arch.msr_kvm_poll_control = data; + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: @@ -2885,6 +2894,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_PV_EOI_EN: msr_info->data = vcpu->arch.pv_eoi.msr_val; break; + case MSR_KVM_POLL_CONTROL: + msr_info->data = vcpu->arch.msr_kvm_poll_control; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -8861,6 +8873,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + + /* poll control enabled by default */ + vcpu->arch.msr_kvm_poll_control = 1; + mutex_unlock(&vcpu->mutex); if (!kvmclock_periodic_sync) @@ -9972,6 +9988,13 @@ bool kvm_vector_hashing_enabled(void) } EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; +} +EXPORT_SYMBOL_GPL(kvm_arch_no_poll); + + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -- cgit v1.2.3 From 7d2296bfa52cc9b63063935dd21ca5fa0d790754 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 31 Mar 2019 19:17:21 -0700 Subject: kvm: x86: check kvm_apic_sw_enabled() is enough On delivering irq to apic, we iterate on vcpu and do the check like this: kvm_apic_present(vcpu) kvm_lapic_enabled(vpu) kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic) Since we have already checked kvm_apic_present(), it is reasonable to replace kvm_lapic_enabled() with kvm_apic_sw_enabled(). Signed-off-by: Wei Yang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 3cc3b2d130a0..188beb301cc5 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -86,7 +86,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_lapic_enabled(vcpu)) { + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { if (!kvm_vector_hashing_enabled()) { if (!lowest) lowest = vcpu; -- cgit v1.2.3 From ee171d2f39d60c6405dc21a0aaa766cad2c9c43e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 31 Mar 2019 19:17:22 -0700 Subject: kvm: x86: use same convention to name kvm_lapic_{set,clear}_vector() apic_clear_vector() is the counterpart of kvm_lapic_set_vector(), while they have different naming convention. Rename it and move together to arch/x86/kvm/lapic.h. Also fix one typo in comment by hand. Signed-off-by: Wei Yang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 +++++++---------- arch/x86/kvm/lapic.h | 5 +++++ 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fcf42a340790..9a54fccd0aa2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -87,11 +87,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } -static inline void apic_clear_vector(int vec, void *bitmap) -{ - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int __apic_test_and_set_vector(int vec, void *bitmap) { return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -445,12 +440,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ - apic_clear_vector(vec, apic->regs + APIC_IRR); + kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; - apic_clear_vector(vec, apic->regs + APIC_IRR); + kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); if (apic_search_irr(apic) != -1) apic->irr_pending = true; } @@ -1055,9 +1050,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) - kvm_lapic_set_vector(vector, apic->regs + APIC_TMR); + kvm_lapic_set_vector(vector, + apic->regs + APIC_TMR); else - apic_clear_vector(vector, apic->regs + APIC_TMR); + kvm_lapic_clear_vector(vector, + apic->regs + APIC_TMR); } if (vcpu->arch.apicv_active) @@ -2333,7 +2330,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) /* * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC satet has changed. + * thinking that APIC state has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f974a3d5a44d..36747174e4a8 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -130,6 +130,11 @@ void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) +static inline void kvm_lapic_clear_vector(int vec, void *bitmap) +{ + clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline void kvm_lapic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -- cgit v1.2.3 From 4cb8b1163586addc9e1f4eebe232a11534c24d4a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 31 Mar 2019 19:17:23 -0700 Subject: kvm: x86: offset is ensure to be in range In function apic_mmio_write(), the offset has been checked in: * apic_mmio_in_range() * offset & 0xf These two ensures offset is in range [0x010, 0xff0]. Signed-off-by: Wei Yang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9a54fccd0aa2..e82a18ccfc1a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2023,7 +2023,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, apic_debug("%s: offset 0x%x with length 0x%x, and value is " "0x%x\n", __func__, offset, len, val); - kvm_lapic_reg_write(apic, offset & 0xff0, val); + kvm_lapic_reg_write(apic, offset, val); return 0; } -- cgit v1.2.3 From 73f624f47c495d7129abef4b7031ed371cc7abb6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 6 Jun 2019 14:32:59 +0200 Subject: KVM: x86: move MSR_IA32_POWER_CTL handling to common code Make it available to AMD hosts as well, just in case someone is trying to use an Intel processor's CPUID setup. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 6 ------ arch/x86/kvm/vmx/vmx.h | 2 -- arch/x86/kvm/x86.c | 6 ++++++ 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a86026969b19..35e7937cc9ac 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -689,6 +689,7 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 msr_ia32_power_ctl; u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cccf73a91e88..5d903f8909d1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1695,9 +1695,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; - case MSR_IA32_POWER_CTL: - msr_info->data = vmx->msr_ia32_power_ctl; - break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1828,9 +1825,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_POWER_CTL: - vmx->msr_ia32_power_ctl = data; - break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 61128b48c503..1cdaa5af8245 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -260,8 +260,6 @@ struct vcpu_vmx { unsigned long host_debugctlmsr; - u64 msr_ia32_power_ctl; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb87d71ec14a..dcba699e2d46 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2563,6 +2563,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; + case MSR_IA32_POWER_CTL: + vcpu->arch.msr_ia32_power_ctl = data; + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2822,6 +2825,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_POWER_CTL: + msr_info->data = vcpu->arch.msr_ia32_power_ctl; + break; case MSR_IA32_TSC: msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; break; -- cgit v1.2.3 From beb8d93b3e423043e079ef3dda19dad7b28467a8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2019 22:50:55 -0700 Subject: KVM: VMX: Fix handling of #MC that occurs during VM-Entry A previous fix to prevent KVM from consuming stale VMCS state after a failed VM-Entry inadvertantly blocked KVM's handling of machine checks that occur during VM-Entry. Per Intel's SDM, a #MC during VM-Entry is handled in one of three ways, depending on when the #MC is recognoized. As it pertains to this bug fix, the third case explicitly states EXIT_REASON_MCE_DURING_VMENTRY is handled like any other VM-Exit during VM-Entry, i.e. sets bit 31 to indicate the VM-Entry failed. If a machine-check event occurs during a VM entry, one of the following occurs: - The machine-check event is handled as if it occurred before the VM entry: ... - The machine-check event is handled after VM entry completes: ... - A VM-entry failure occurs as described in Section 26.7. The basic exit reason is 41, for "VM-entry failure due to machine-check event". Explicitly handle EXIT_REASON_MCE_DURING_VMENTRY as a one-off case in vmx_vcpu_run() instead of binning it into vmx_complete_atomic_exit(). Doing so allows vmx_vcpu_run() to handle VMX_EXIT_REASONS_FAILED_VMENTRY in a sane fashion and also simplifies vmx_complete_atomic_exit() since VMCS.VM_EXIT_INTR_INFO is guaranteed to be fresh. Fixes: b060ca3b2e9e7 ("kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5d903f8909d1..1b3ca0582a0c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6107,28 +6107,21 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info = 0; - u16 basic_exit_reason = (u16)vmx->exit_reason; - - if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) + if (vmx->exit_reason != EXIT_REASON_EXCEPTION_NMI) return; - if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmx->exit_intr_info = exit_intr_info; + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || - is_machine_check(exit_intr_info)) + if (is_machine_check(vmx->exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(exit_intr_info)) { + if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6535,6 +6528,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = 0; vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + kvm_machine_check(); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) return; -- cgit v1.2.3 From 2ea72039808d50c909c2eb00eaebfaaaa743927a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 6 Jun 2019 14:57:25 +0200 Subject: kvm: nVMX: small cleanup in handle_exception The reason for skipping handling of NMI and #MC in handle_exception is the same, namely they are handled earlier by vmx_complete_atomic_exit. Calling the machine check handler (which just returns 1) is misleading, don't do it. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1b3ca0582a0c..da6c829bad9f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4455,11 +4455,8 @@ static int handle_exception(struct kvm_vcpu *vcpu) vect_info = vmx->idt_vectoring_info; intr_info = vmx->exit_intr_info; - if (is_machine_check(intr_info)) - return handle_machine_check(vcpu); - - if (is_nmi(intr_info)) - return 1; /* already handled by vmx_vcpu_run() */ + if (is_machine_check(intr_info) || is_nmi(intr_info)) + return 1; /* already handled by vmx_complete_atomic_exit */ if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); -- cgit v1.2.3 From 49def500e5eca40c73da9bad9799e9fab885996f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2019 22:50:56 -0700 Subject: KVM: VMX: Read cached VM-Exit reason to detect external interrupt Generic x86 code invokes the kvm_x86_ops external interrupt handler on all VM-Exits regardless of the actual exit type. Use the already-cached EXIT_REASON to determine if the VM-Exit was due to an interrupt, thus avoiding an extra VMREAD (to query VM_EXIT_INTR_INFO) for all other types of VM-Exit. In addition to avoiding the extra VMREAD, checking the EXIT_REASON instead of VM_EXIT_INTR_INFO makes it more obvious that vmx_handle_external_intr() is called for all VM-Exits, e.g. someone unfamiliar with the flow might wonder under what condition(s) VM_EXIT_INTR_INFO does not contain a valid interrupt, which is simply not possible since KVM always runs with "ack interrupt on exit". WARN once if VM_EXIT_INTR_INFO doesn't contain a valid interrupt on an EXTERNAL_INTERRUPT VM-Exit, as such a condition would indicate a hardware bug. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs.h | 6 +++++ arch/x86/kvm/vmx/vmx.c | 62 ++++++++++++++++++++++++++----------------------- 2 files changed, 39 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index cb6079f8a227..971a46c69df4 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -115,6 +115,12 @@ static inline bool is_nmi(u32 intr_info) == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); } +static inline bool is_external_intr(u32 intr_info) +{ + return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR); +} + enum vmcs_field_width { VMCS_FIELD_WIDTH_U16 = 0, VMCS_FIELD_WIDTH_U64 = 1, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index da6c829bad9f..b541fe2c6347 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6127,42 +6127,46 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) - == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { - unsigned int vector; - unsigned long entry; - gate_desc *desc; - struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int vector; + unsigned long entry; #ifdef CONFIG_X86_64 - unsigned long tmp; + unsigned long tmp; #endif + gate_desc *desc; + u32 intr_info; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(desc); - asm volatile( + if (to_vmx(vcpu)->exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) + return; + + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (WARN_ONCE(!is_external_intr(intr_info), + "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + return; + + vector = intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; + entry = gate_offset(desc); + + asm volatile( #ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" - "push %[sp]\n\t" + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" #endif - "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" - CALL_NOSPEC - : + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + CALL_NOSPEC + : #ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), + [sp]"=&r"(tmp), #endif - ASM_CALL_CONSTRAINT - : - THUNK_TARGET(entry), - [ss]"i"(__KERNEL_DS), - [cs]"i"(__KERNEL_CS) - ); - } + ASM_CALL_CONSTRAINT + : + THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); } STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); -- cgit v1.2.3 From 2342080cd6752fd40958f5a2aee0fb496ae92dce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2019 22:50:57 -0700 Subject: KVM: VMX: Store the host kernel's IDT base in a global variable Although the kernel may use multiple IDTs, KVM should only ever see the "real" IDT, e.g. the early init IDT is long gone by the time KVM runs and the debug stack IDT is only used for small windows of time in very specific flows. Before commit a547c6db4d2f1 ("KVM: VMX: Enable acknowledge interupt on vmexit"), the kernel's IDT base was consumed by KVM only when setting constant VMCS state, i.e. to set VMCS.HOST_IDTR_BASE. Because constant host state is done once per vCPU, there was ostensibly no need to cache the kernel's IDT base. When support for "ack interrupt on exit" was introduced, KVM added a second consumer of the IDT base as handling already-acked interrupts requires directly calling the interrupt handler, i.e. KVM uses the IDT base to find the address of the handler. Because interrupts are a fast path, KVM cached the IDT base to avoid having to VMREAD HOST_IDTR_BASE. Presumably, the IDT base was cached on a per-vCPU basis simply because the existing code grabbed the IDT base on a per-vCPU (VMCS) basis. Note, all post-boot IDTs use the same handlers for external interrupts, i.e. the "ack interrupt on exit" use of the IDT base would be unaffected even if the cached IDT somehow did not match the current IDT. And as for the original use case of setting VMCS.HOST_IDTR_BASE, if any of the above analysis is wrong then KVM has had a bug since the beginning of time since KVM has effectively been caching the IDT at vCPU creation since commit a8b732ca01c ("[PATCH] kvm: userspace interface"). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 12 +++++++----- arch/x86/kvm/vmx/vmx.h | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b541fe2c6347..c90abf33b509 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -392,6 +392,7 @@ static const struct kvm_vmx_segment_field { }; u64 host_efer; +static unsigned long host_idt_base; /* * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm @@ -3728,7 +3729,6 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; - struct desc_ptr dt; unsigned long cr0, cr3, cr4; cr0 = read_cr0(); @@ -3764,9 +3764,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; + vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ @@ -6144,7 +6142,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) return; vector = intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)vmx->host_idt_base + vector; + desc = (gate_desc *)host_idt_base + vector; entry = gate_offset(desc); asm volatile( @@ -7429,10 +7427,14 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) static __init int hardware_setup(void) { unsigned long host_bndcfgs; + struct desc_ptr dt; int r, i; rdmsrl_safe(MSR_EFER, &host_efer); + store_idt(&dt); + host_idt_base = dt.address; + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1cdaa5af8245..decd31055da8 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -187,7 +187,6 @@ struct vcpu_vmx { int nmsrs; int save_nmsrs; bool guest_msrs_dirty; - unsigned long host_idt_base; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; -- cgit v1.2.3 From 165072b089e5af32c2693ab900d5fb5d41e3f293 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2019 22:50:58 -0700 Subject: KVM: x86: Move kvm_{before,after}_interrupt() calls to vendor code VMX can conditionally call kvm_{before,after}_interrupt() since KVM always uses "ack interrupt on exit" and therefore explicitly handles interrupts as opposed to blindly enabling irqs. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 2 -- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 302cb409d452..acc09e9fc173 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6174,6 +6174,7 @@ out: static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { + kvm_before_interrupt(vcpu); local_irq_enable(); /* * We must have an instruction with interrupts enabled, so @@ -6181,6 +6182,7 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu) */ asm("nop"); local_irq_disable(); + kvm_after_interrupt(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c90abf33b509..963c8c409223 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6145,6 +6145,8 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) desc = (gate_desc *)host_idt_base + vector; entry = gate_offset(desc); + kvm_before_interrupt(vcpu); + asm volatile( #ifdef CONFIG_X86_64 "mov %%" _ASM_SP ", %[sp]\n\t" @@ -6165,6 +6167,8 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); + + kvm_after_interrupt(vcpu); } STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dcba699e2d46..9ff3a670d2b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7999,9 +7999,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_before_interrupt(vcpu); kvm_x86_ops->handle_external_intr(vcpu); - kvm_after_interrupt(vcpu); ++vcpu->stat.exits; -- cgit v1.2.3 From 95b5a48c4f2b7755702c2993f9986e4a45d85e45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Apr 2019 22:50:59 -0700 Subject: KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn Per commit 1b6269db3f833 ("KVM: VMX: Handle NMIs before enabling interrupts and preemption"), NMIs are handled directly in vmx_vcpu_run() to "make sure we handle NMI on the current cpu, and that we don't service maskable interrupts before non-maskable ones". The other exceptions handled by complete_atomic_exit(), e.g. async #PF and #MC, have similar requirements, and are located there to avoid extra VMREADs since VMX bins hardware exceptions and NMIs into a single exit reason. Clean up the code and eliminate the vaguely named complete_atomic_exit() by moving the interrupts-disabled exception and NMI handling into the existing handle_external_intrs() callback, and rename the callback to a more appropriate name. Rename VMexit handlers throughout so that the atomic and non-atomic counterparts have similar names. In addition to improving code readability, this also ensures the NMI handler is run with the host's debug registers loaded in the unlikely event that the user is debugging NMIs. Accuracy of the last_guest_tsc field is also improved when handling NMIs (and #MCs) as the handler will run after updating said field. Signed-off-by: Sean Christopherson [Naming cleanups. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 33 ++++++++++++++++++--------------- arch/x86/kvm/x86.c | 2 +- 4 files changed, 22 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35e7937cc9ac..f46a12a5cf2e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1117,7 +1117,7 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - void (*handle_external_intr)(struct kvm_vcpu *vcpu); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index acc09e9fc173..bbc31f7213ed 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6172,7 +6172,7 @@ out: return ret; } -static void svm_handle_external_intr(struct kvm_vcpu *vcpu) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { kvm_before_interrupt(vcpu); local_irq_enable(); @@ -7268,7 +7268,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_tdp_cr3 = set_tdp_cr3, .check_intercept = svm_check_intercept, - .handle_external_intr = svm_handle_external_intr, + .handle_exit_irqoff = svm_handle_exit_irqoff, .request_immediate_exit = __kvm_request_immediate_exit, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 963c8c409223..2b182f58c126 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4437,11 +4437,11 @@ static void kvm_machine_check(void) static int handle_machine_check(struct kvm_vcpu *vcpu) { - /* already handled by vcpu_run */ + /* handled by vmx_vcpu_run() */ return 1; } -static int handle_exception(struct kvm_vcpu *vcpu) +static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -4454,7 +4454,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) intr_info = vmx->exit_intr_info; if (is_machine_check(intr_info) || is_nmi(intr_info)) - return 1; /* already handled by vmx_complete_atomic_exit */ + return 1; /* handled by handle_exception_nmi_irqoff() */ if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -5462,7 +5462,7 @@ static int handle_encls(struct kvm_vcpu *vcpu) * to be done to userspace and return 0. */ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { - [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, @@ -6100,11 +6100,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) +static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - if (vmx->exit_reason != EXIT_REASON_EXCEPTION_NMI) - return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ @@ -6123,7 +6120,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) } } -static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { unsigned int vector; unsigned long entry; @@ -6133,9 +6130,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) gate_desc *desc; u32 intr_info; - if (to_vmx(vcpu)->exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) - return; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) @@ -6170,7 +6164,17 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) kvm_after_interrupt(vcpu); } -STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); +STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); + +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu); + else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) + handle_exception_nmi_irqoff(vmx); +} static bool vmx_has_emulated_msr(int index) { @@ -6540,7 +6544,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } @@ -7694,7 +7697,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_tdp_cr3 = vmx_set_cr3, .check_intercept = vmx_check_intercept, - .handle_external_intr = vmx_handle_external_intr, + .handle_exit_irqoff = vmx_handle_exit_irqoff, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ff3a670d2b3..848ca0335b59 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7999,7 +7999,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops->handle_external_intr(vcpu); + kvm_x86_ops->handle_exit_irqoff(vcpu); ++vcpu->stat.exits; -- cgit v1.2.3 From fadcead00c3e167255ac3016b6c3855cf639cca5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:23 -0700 Subject: KVM: nVMX: Intercept VMWRITEs to read-only shadow VMCS fields Allowing L1 to VMWRITE read-only fields is only beneficial in a double nesting scenario, e.g. no sane VMM will VMWRITE VM_EXIT_REASON in normal non-nested operation. Intercepting RO fields means KVM doesn't need to sync them from the shadow VMCS to vmcs12 when running L2. The obvious downside is that L1 will VM-Exit more often when running L3, but it's likely safe to assume most folks would happily sacrifice a bit of L3 performance, which may not even be noticeable in the grande scheme, to improve L2 performance across the board. Not intercepting fields tagged read-only also allows for additional optimizations, e.g. marking GUEST_{CS,SS}_AR_BYTES as SHADOW_FIELD_RO since those fields are rarely written by a VMMs, but read frequently. When utilizing a shadow VMCS with asymmetric R/W and R/O bitmaps, fields that cause VM-Exit on VMWRITE but not VMREAD need to be propagated to the shadow VMCS during VMWRITE emulation, otherwise a subsequence VMREAD from L1 will consume a stale value. Note, KVM currently utilizes asymmetric bitmaps when "VMWRITE any field" is not exposed to L1, but only so that it can reject the VMWRITE, i.e. propagating the VMWRITE to the shadow VMCS is a new requirement, not a bug fix. Eliminating the copying of RO fields reduces the latency of nested VM-Entry (copy_shadow_to_vmcs12()) by ~100 cycles (plus 40-50 cycles if/when the AR_BYTES fields are exposed RO). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 72 ++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c92349e2f621..0dc9505ae9a2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1105,14 +1105,6 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) vmx->nested.msrs.misc_low = data; vmx->nested.msrs.misc_high = data >> 32; - /* - * If L1 has read-only VM-exit information fields, use the - * less permissive vmx_vmwrite_bitmap to specify write - * permissions for the shadow VMCS. - */ - if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); - return 0; } @@ -1301,41 +1293,27 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) } /* - * Copy the writable VMCS shadow fields back to the VMCS12, in case - * they have been modified by the L1 guest. Note that the "read-only" - * VM-exit information fields are actually writable if the vCPU is - * configured to support "VMWRITE to any supported field in the VMCS." + * Copy the writable VMCS shadow fields back to the VMCS12, in case they have + * been modified by the L1 guest. Note, "writable" in this context means + * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of + * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" + * VM-exit information fields (which are actually writable if the vCPU is + * configured to support "VMWRITE to any supported field in the VMCS"). */ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { - const u16 *fields[] = { - shadow_read_write_fields, - shadow_read_only_fields - }; - const int max_fields[] = { - max_shadow_read_write_fields, - max_shadow_read_only_fields - }; - int i, q; - unsigned long field; - u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); + unsigned long field; + int i; preempt_disable(); vmcs_load(shadow_vmcs); - for (q = 0; q < ARRAY_SIZE(fields); q++) { - for (i = 0; i < max_fields[q]; i++) { - field = fields[q][i]; - field_value = __vmcs_readl(field); - vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); - } - /* - * Skip the VM-exit information fields if they are read-only. - */ - if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - break; + for (i = 0; i < max_shadow_read_write_fields; i++) { + field = shadow_read_write_fields[i]; + vmcs12_write_any(vmcs12, field, __vmcs_readl(field)); } vmcs_clear(shadow_vmcs); @@ -4517,6 +4495,24 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * path of prepare_vmcs02. */ break; + +#define SHADOW_FIELD_RO(x) case x: +#include "vmcs_shadow_fields.h" + /* + * L1 can read these fields without exiting, ensure the + * shadow VMCS is up-to-date. + */ + if (enable_shadow_vmcs) { + preempt_disable(); + vmcs_load(vmx->vmcs01.shadow_vmcs); + + __vmcs_writel(field, field_value); + + vmcs_clear(vmx->vmcs01.shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + preempt_enable(); + } + /* fall through */ default: vmx->nested.dirty_vmcs12 = true; break; @@ -5470,14 +5466,8 @@ error_guest_mode: void nested_vmx_vcpu_setup(void) { if (enable_shadow_vmcs) { - /* - * At vCPU creation, "VMWRITE to any supported field - * in the VMCS" is supported, so use the more - * permissive vmx_vmread_bitmap to specify both read - * and write permissions for the shadow VMCS. - */ vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } } -- cgit v1.2.3 From b643780562af5378ef7fe731c65b8f93e49c59c6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:24 -0700 Subject: KVM: nVMX: Intercept VMWRITEs to GUEST_{CS,SS}_AR_BYTES VMMs frequently read the guest's CS and SS AR bytes to detect 64-bit mode and CPL respectively, but effectively never write said fields once the VM is initialized. Intercepting VMWRITEs for the two fields saves ~55 cycles in copy_shadow_to_vmcs12(). Because some Intel CPUs, e.g. Haswell, drop the reserved bits of the guest access rights fields on VMWRITE, exposing the fields to L1 for VMREAD but not VMWRITE leads to inconsistent behavior between L1 and L2. On hardware that drops the bits, L1 will see the stripped down value due to reading the value from hardware, while L2 will see the full original value as stored by KVM. To avoid such an inconsistency, emulate the behavior on all CPUS, but only for intercepted VMWRITEs so as to avoid introducing pointless latency into copy_shadow_to_vmcs12(), e.g. if the emulation were added to vmcs12_write_any(). Since the AR_BYTES emulation is done only for intercepted VMWRITE, if a future patch (re)exposed AR_BYTES for both VMWRITE and VMREAD, then KVM would end up with incosistent behavior on pre-Haswell hardware, e.g. KVM would drop the reserved bits on intercepted VMWRITE, but direct VMWRITE to the shadow VMCS would not drop the bits. Add a WARN in the shadow field initialization to detect any attempt to expose an AR_BYTES field without updating vmcs12_write_any(). Note, emulation of the AR_BYTES reserved bit behavior is based on a patch[1] from Jim Mattson that applied the emulation to all writes to vmcs12 so that live migration across different generations of hardware would not introduce divergent behavior. But given that live migration of nested state has already been enabled, that ship has sailed (not to mention that no sane VMM will be affected by this behavior). [1] https://patchwork.kernel.org/patch/10483321/ Cc: Jim Mattson Cc: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 15 +++++++++++++++ arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0dc9505ae9a2..cd51ef68434e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -91,6 +91,10 @@ static void init_vmcs_shadow_fields(void) pr_err("Missing field from shadow_read_write_field %x\n", field + 1); + WARN_ONCE(field >= GUEST_ES_AR_BYTES && + field <= GUEST_TR_AR_BYTES, + "Update vmcs12_write_any() to expose AR_BYTES RW"); + /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist @@ -4477,6 +4481,17 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) vmcs12 = get_shadow_vmcs12(vcpu); } + /* + * Some Intel CPUs intentionally drop the reserved bits of the AR byte + * fields on VMWRITE. Emulate this behavior to ensure consistent KVM + * behavior regardless of the underlying hardware, e.g. if an AR_BYTE + * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD + * from L1 will return a different value than VMREAD from L2 (L1 sees + * the stripped down value, L2 sees the full value as stored by KVM). + */ + if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) + field_value &= 0x1f0ff; + if (vmcs12_write_any(vmcs12, field, field_value) < 0) return nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 132432f375c2..97dd5295be31 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -40,14 +40,14 @@ SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) +SHADOW_FIELD_RO(GUEST_CS_AR_BYTES) +SHADOW_FIELD_RO(GUEST_SS_AR_BYTES) SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) SHADOW_FIELD_RW(EXCEPTION_BITMAP) SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) SHADOW_FIELD_RW(TPR_THRESHOLD) -SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) -- cgit v1.2.3 From 1c6f0b47fb59d1674a6ba91b0ce58dfbfc5a76de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:25 -0700 Subject: KVM: nVMX: Track vmcs12 offsets for shadowed VMCS fields The vmcs12 fields offsets are constant and known at compile time. Store the associated offset for each shadowed field to avoid the costly lookup in vmcs_field_to_offset() when copying between vmcs12 and the shadow VMCS. Avoiding the costly lookup reduces the latency of copying by ~100 cycles in each direction. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 96 +++++++++++++++++++---------------- arch/x86/kvm/vmx/vmcs12.h | 57 +++++++-------------- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 74 +++++++++++++-------------- 3 files changed, 108 insertions(+), 119 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index cd51ef68434e..376fd9eabe42 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -41,15 +41,19 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) -static u16 shadow_read_only_fields[] = { -#define SHADOW_FIELD_RO(x) x, +struct shadow_vmcs_field { + u16 encoding; + u16 offset; +}; +static struct shadow_vmcs_field shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static u16 shadow_read_write_fields[] = { -#define SHADOW_FIELD_RW(x) x, +static struct shadow_vmcs_field shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_write_fields = @@ -63,37 +67,39 @@ static void init_vmcs_shadow_fields(void) memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); for (i = j = 0; i < max_shadow_read_only_fields; i++) { - u16 field = shadow_read_only_fields[i]; + struct shadow_vmcs_field entry = shadow_read_only_fields[i]; + u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_only_fields || - shadow_read_only_fields[i + 1] != field + 1)) + shadow_read_only_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_only_field %x\n", field + 1); clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 if (field & 1) +#ifdef CONFIG_X86_64 continue; +#else + entry.offset += sizeof(u32); #endif - if (j < i) - shadow_read_only_fields[j] = field; - j++; + shadow_read_only_fields[j++] = entry; } max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { - u16 field = shadow_read_write_fields[i]; + struct shadow_vmcs_field entry = shadow_read_write_fields[i]; + u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_write_fields || - shadow_read_write_fields[i + 1] != field + 1)) + shadow_read_write_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_write_field %x\n", field + 1); WARN_ONCE(field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES, - "Update vmcs12_write_any() to expose AR_BYTES RW"); + "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); /* * PML and the preemption timer can be emulated, but the @@ -119,13 +125,13 @@ static void init_vmcs_shadow_fields(void) clear_bit(field, vmx_vmwrite_bitmap); clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 if (field & 1) +#ifdef CONFIG_X86_64 continue; +#else + entry.offset += sizeof(u32); #endif - if (j < i) - shadow_read_write_fields[j] = field; - j++; + shadow_read_write_fields[j++] = entry; } max_shadow_read_write_fields = j; } @@ -1308,7 +1314,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); - unsigned long field; + struct shadow_vmcs_field field; + unsigned long val; int i; preempt_disable(); @@ -1317,7 +1324,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_shadow_read_write_fields; i++) { field = shadow_read_write_fields[i]; - vmcs12_write_any(vmcs12, field, __vmcs_readl(field)); + val = __vmcs_readl(field.encoding); + vmcs12_write_any(vmcs12, field.encoding, field.offset, val); } vmcs_clear(shadow_vmcs); @@ -1328,7 +1336,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const u16 *fields[] = { + const struct shadow_vmcs_field *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -1336,18 +1344,20 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) max_shadow_read_write_fields, max_shadow_read_only_fields }; - int i, q; - unsigned long field; - u64 field_value = 0; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); + struct shadow_vmcs_field field; + unsigned long val; + int i, q; vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); - __vmcs_writel(field, field_value); + val = vmcs12_read_any(vmcs12, field.encoding, + field.offset); + __vmcs_writel(field.encoding, val); } } @@ -2144,6 +2154,8 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); @@ -2240,23 +2252,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { prepare_vmcs02_full(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; } - /* - * First, the fields that are shadowed. This must be kept in sync - * with vmcs_shadow_fields.h. - */ - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - } - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); @@ -4372,6 +4373,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) int len; gva_t gva = 0; struct vmcs12 *vmcs12; + short offset; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4393,11 +4395,15 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vmcs12, field, &field_value) < 0) + + offset = vmcs_field_to_offset(field); + if (offset < 0) return nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* Read the field, zero-extended to a u64 field_value */ + field_value = vmcs12_read_any(vmcs12, field, offset); + /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending @@ -4437,6 +4443,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; struct vmcs12 *vmcs12; + short offset; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4481,6 +4488,11 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) vmcs12 = get_shadow_vmcs12(vcpu); } + offset = vmcs_field_to_offset(field); + if (offset < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* * Some Intel CPUs intentionally drop the reserved bits of the AR byte * fields on VMWRITE. Emulate this behavior to ensure consistent KVM @@ -4492,9 +4504,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) field_value &= 0x1f0ff; - if (vmcs12_write_any(vmcs12, field, field_value) < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + vmcs12_write_any(vmcs12, field, offset, field_value); /* * Do not track vmcs12 dirty-state if in guest-mode @@ -4502,7 +4512,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ if (!is_guest_mode(vcpu)) { switch (field) { -#define SHADOW_FIELD_RW(x) case x: +#define SHADOW_FIELD_RW(x, y) case x: #include "vmcs_shadow_fields.h" /* * The fields that can be updated by L1 without a vmexit are @@ -4511,7 +4521,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ break; -#define SHADOW_FIELD_RO(x) case x: +#define SHADOW_FIELD_RO(x, y) case x: #include "vmcs_shadow_fields.h" /* * L1 can read these fields without exiting, ensure the diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 3a742428ad17..9cd26099fcc0 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -394,69 +394,48 @@ static inline short vmcs_field_to_offset(unsigned long field) #undef ROL16 -/* - * Read a vmcs12 field. Since these can have varying lengths and we return - * one type, we chose the biggest type (u64) and zero-extend the return value - * to that size. Note that the caller, handle_vmread, might need to use only - * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of - * 64-bit fields are to be returned). - */ -static inline int vmcs12_read_any(struct vmcs12 *vmcs12, - unsigned long field, u64 *ret) +static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset) { - short offset = vmcs_field_to_offset(field); - char *p; - - if (offset < 0) - return offset; - - p = (char *)vmcs12 + offset; + char *p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *ret = *((natural_width *)p); - return 0; + return *((natural_width *)p); case VMCS_FIELD_WIDTH_U16: - *ret = *((u16 *)p); - return 0; + return *((u16 *)p); case VMCS_FIELD_WIDTH_U32: - *ret = *((u32 *)p); - return 0; + return *((u32 *)p); case VMCS_FIELD_WIDTH_U64: - *ret = *((u64 *)p); - return 0; + return *((u64 *)p); default: - WARN_ON(1); - return -ENOENT; + WARN_ON_ONCE(1); + return -1; } } -static inline int vmcs12_write_any(struct vmcs12 *vmcs12, - unsigned long field, u64 field_value){ - short offset = vmcs_field_to_offset(field); +static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset, u64 field_value) +{ char *p = (char *)vmcs12 + offset; - if (offset < 0) - return offset; - switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; - return 0; + break; case VMCS_FIELD_WIDTH_U32: *(u32 *)p = field_value; - return 0; + break; case VMCS_FIELD_WIDTH_U64: *(u64 *)p = field_value; - return 0; + break; case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *(natural_width *)p = field_value; - return 0; + break; default: - WARN_ON(1); - return -ENOENT; + WARN_ON_ONCE(1); + break; } - } #endif /* __KVM_X86_VMX_VMCS12_H */ diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 97dd5295be31..2cfa19ca158e 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -1,8 +1,8 @@ #ifndef SHADOW_FIELD_RO -#define SHADOW_FIELD_RO(x) +#define SHADOW_FIELD_RO(x, y) #endif #ifndef SHADOW_FIELD_RW -#define SHADOW_FIELD_RW(x) +#define SHADOW_FIELD_RW(x, y) #endif /* @@ -28,47 +28,47 @@ */ /* 16-bits */ -SHADOW_FIELD_RW(GUEST_INTR_STATUS) -SHADOW_FIELD_RW(GUEST_PML_INDEX) -SHADOW_FIELD_RW(HOST_FS_SELECTOR) -SHADOW_FIELD_RW(HOST_GS_SELECTOR) +SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status) +SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index) +SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector) +SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector) /* 32-bits */ -SHADOW_FIELD_RO(VM_EXIT_REASON) -SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) -SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) -SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) -SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) -SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) -SHADOW_FIELD_RO(GUEST_CS_AR_BYTES) -SHADOW_FIELD_RO(GUEST_SS_AR_BYTES) -SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) -SHADOW_FIELD_RW(EXCEPTION_BITMAP) -SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) -SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) -SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) -SHADOW_FIELD_RW(TPR_THRESHOLD) -SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) -SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) +SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code) +SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes) +SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control) +SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len) +SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value) /* Natural width */ -SHADOW_FIELD_RO(EXIT_QUALIFICATION) -SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) -SHADOW_FIELD_RW(GUEST_RIP) -SHADOW_FIELD_RW(GUEST_RSP) -SHADOW_FIELD_RW(GUEST_CR0) -SHADOW_FIELD_RW(GUEST_CR3) -SHADOW_FIELD_RW(GUEST_CR4) -SHADOW_FIELD_RW(GUEST_RFLAGS) -SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) -SHADOW_FIELD_RW(CR0_READ_SHADOW) -SHADOW_FIELD_RW(CR4_READ_SHADOW) -SHADOW_FIELD_RW(HOST_FS_BASE) -SHADOW_FIELD_RW(HOST_GS_BASE) +SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address) +SHADOW_FIELD_RW(GUEST_RIP, guest_rip) +SHADOW_FIELD_RW(GUEST_RSP, guest_rsp) +SHADOW_FIELD_RW(GUEST_CR0, guest_cr0) +SHADOW_FIELD_RW(GUEST_CR3, guest_cr3) +SHADOW_FIELD_RW(GUEST_CR4, guest_cr4) +SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask) +SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow) +SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow) +SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base) +SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base) /* 64-bit */ -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address) #undef SHADOW_FIELD_RO #undef SHADOW_FIELD_RW -- cgit v1.2.3 From f4f8316d2ad5f93ff0ec829a893db0728b9041bd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:26 -0700 Subject: KVM: nVMX: Lift sync_vmcs12() out of prepare_vmcs12() ... to make it more obvious that sync_vmcs12() is invoked on all nested VM-Exits, e.g. hiding sync_vmcs12() in prepare_vmcs12() makes it appear that guest state is NOT propagated to vmcs12 for a normal VM-Exit. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 376fd9eabe42..bdaf49d9260f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3500,11 +3500,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { - /* update guest state fields: */ - sync_vmcs12(vcpu, vmcs12); - /* update exit information fields: */ - vmcs12->vm_exit_reason = exit_reason; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -3865,9 +3861,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vcpu->arch.tsc_offset -= vmcs12->tsc_offset; if (likely(!vmx->fail)) { - if (exit_reason == -1) - sync_vmcs12(vcpu, vmcs12); - else + sync_vmcs12(vcpu, vmcs12); + + if (exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); -- cgit v1.2.3 From 3731905ef28fc1a9240d1532b2a9efbaea205dc0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:27 -0700 Subject: KVM: nVMX: Use descriptive names for VMCS sync functions and flags Nested virtualization involves copying data between many different types of VMCSes, e.g. vmcs02, vmcs12, shadow VMCS and eVMCS. Rename a variety of functions and flags to document both the source and destination of each sync. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 28 ++++++++++++++-------------- arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/vmx/vmx.h | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bdaf49d9260f..fc2b8f4cf45f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1615,7 +1615,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; * evmcs->host_idtr_base = vmcs12->host_idtr_base; * evmcs->host_rsp = vmcs12->host_rsp; - * sync_vmcs12() doesn't read these: + * sync_vmcs02_to_vmcs12() doesn't read these: * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; * evmcs->msr_bitmap = vmcs12->msr_bitmap; @@ -1839,7 +1839,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, return 1; } -void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) +void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1860,7 +1860,7 @@ void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) copy_vmcs12_to_shadow(vmx); } - vmx->nested.need_vmcs12_sync = false; + vmx->nested.need_vmcs12_to_shadow_sync = false; } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) @@ -3042,7 +3042,7 @@ vmentry_fail_vmexit: vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; return 1; } @@ -3382,7 +3382,7 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) * VM-entry controls is also updated, since this is really a guest * state bit.) */ -static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -3861,14 +3861,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vcpu->arch.tsc_offset -= vmcs12->tsc_offset; if (likely(!vmx->fail)) { - sync_vmcs12(vcpu, vmcs12); + sync_vmcs02_to_vmcs12(vcpu, vmcs12); if (exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); /* - * Must happen outside of sync_vmcs12() as it will + * Must happen outside of sync_vmcs02_to_vmcs12() as it will * also be used to capture vmcs12 cache as part of * capturing nVMX state for snapshot (migration). * @@ -3924,7 +3924,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -4284,7 +4284,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); - vmx->nested.need_vmcs12_sync = false; + vmx->nested.need_vmcs12_to_shadow_sync = false; vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; @@ -4551,7 +4551,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; } @@ -5303,12 +5303,12 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, * When running L2, the authoritative vmcs12 state is in the * vmcs02. When running L1, the authoritative vmcs12 state is * in the shadow or enlightened vmcs linked to vmcs01, unless - * need_vmcs12_sync is set, in which case, the authoritative + * need_vmcs12_to_shadow_sync is set, in which case, the authoritative * vmcs12 state is in the vmcs12 already. */ if (is_guest_mode(vcpu)) { - sync_vmcs12(vcpu, vmcs12); - } else if (!vmx->nested.need_vmcs12_sync) { + sync_vmcs02_to_vmcs12(vcpu, vmcs12); + } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { if (vmx->nested.hv_evmcs) copy_enlightened_to_vmcs12(vmx); else if (enable_shadow_vmcs) @@ -5421,7 +5421,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * Sync eVMCS upon entry as we may not have * HV_X64_MSR_VP_ASSIST_PAGE set up yet. */ - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; } else { return -EINVAL; } diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 29d205bb4e4f..187d39bf0bf1 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -17,7 +17,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); -void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu); +void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2b182f58c126..1a87a91e98dc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6399,8 +6399,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_write32(PLE_WINDOW, vmx->ple_window); } - if (vmx->nested.need_vmcs12_sync) - nested_sync_from_vmcs12(vcpu); + if (vmx->nested.need_vmcs12_to_shadow_sync) + nested_sync_vmcs12_to_shadow(vcpu); if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index decd31055da8..f4448292df0f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -113,7 +113,7 @@ struct nested_vmx { * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. */ - bool need_vmcs12_sync; + bool need_vmcs12_to_shadow_sync; bool dirty_vmcs12; /* -- cgit v1.2.3 From e2174295b41d8c9115ab246a630d4630724a2cd5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:28 -0700 Subject: KVM: nVMX: Add helpers to identify shadowed VMCS fields So that future optimizations related to shadowed fields don't need to define their own switch statement. Add a BUILD_BUG_ON() to ensure at least one of the types (RW vs RO) is defined when including vmcs_shadow_fields.h (guess who keeps mistyping SHADOW_FIELD_RO as SHADOW_FIELD_R0). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 71 ++++++++++++++++++++--------------- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 ++ 2 files changed, 44 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fc2b8f4cf45f..0d9a239b9df9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4420,6 +4420,29 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } +static bool is_shadow_field_rw(unsigned long field) +{ + switch (field) { +#define SHADOW_FIELD_RW(x, y) case x: +#include "vmcs_shadow_fields.h" + return true; + default: + break; + } + return false; +} + +static bool is_shadow_field_ro(unsigned long field) +{ + switch (field) { +#define SHADOW_FIELD_RO(x, y) case x: +#include "vmcs_shadow_fields.h" + return true; + default: + break; + } + return false; +} static int handle_vmwrite(struct kvm_vcpu *vcpu) { @@ -4503,41 +4526,27 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) vmcs12_write_any(vmcs12, field, offset, field_value); /* - * Do not track vmcs12 dirty-state if in guest-mode - * as we actually dirty shadow vmcs12 instead of vmcs12. + * Do not track vmcs12 dirty-state if in guest-mode as we actually + * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated + * by L1 without a vmexit are always updated in the vmcs02, i.e. don't + * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. */ - if (!is_guest_mode(vcpu)) { - switch (field) { -#define SHADOW_FIELD_RW(x, y) case x: -#include "vmcs_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - -#define SHADOW_FIELD_RO(x, y) case x: -#include "vmcs_shadow_fields.h" - /* - * L1 can read these fields without exiting, ensure the - * shadow VMCS is up-to-date. - */ - if (enable_shadow_vmcs) { - preempt_disable(); - vmcs_load(vmx->vmcs01.shadow_vmcs); + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { + /* + * L1 can read these fields without exiting, ensure the + * shadow VMCS is up-to-date. + */ + if (enable_shadow_vmcs && is_shadow_field_ro(field)) { + preempt_disable(); + vmcs_load(vmx->vmcs01.shadow_vmcs); - __vmcs_writel(field, field_value); + __vmcs_writel(field, field_value); - vmcs_clear(vmx->vmcs01.shadow_vmcs); - vmcs_load(vmx->loaded_vmcs->vmcs); - preempt_enable(); - } - /* fall through */ - default: - vmx->nested.dirty_vmcs12 = true; - break; + vmcs_clear(vmx->vmcs01.shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + preempt_enable(); } + vmx->nested.dirty_vmcs12 = true; } return nested_vmx_succeed(vcpu); diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 2cfa19ca158e..4cea018ba285 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -1,3 +1,7 @@ +#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW) +BUILD_BUG_ON(1) +#endif + #ifndef SHADOW_FIELD_RO #define SHADOW_FIELD_RO(x, y) #endif -- cgit v1.2.3 From 7952d769c29caac4fb51c1f0f0c12434c805b127 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 08:36:29 -0700 Subject: KVM: nVMX: Sync rarely accessed guest fields only when needed Many guest fields are rarely read (or written) by VMMs, i.e. likely aren't accessed between runs of a nested VMCS. Delay pulling rarely accessed guest fields from vmcs02 until they are VMREAD or until vmcs12 is dirtied. The latter case is necessary because nested VM-Entry will consume all manner of fields when vmcs12 is dirty, e.g. for consistency checks. Note, an alternative to synchronizing all guest fields on VMREAD would be to read *only* the field being accessed, but switching VMCS pointers is expensive and odds are good if one guest field is being accessed then others will soon follow, or that vmcs12 will be dirtied due to a VMWRITE (see above). And the full synchronization results in slightly cleaner code. Note, although GUEST_PDPTRs are relevant only for a 32-bit PAE guest, they are accessed quite frequently for said guests, and a separate patch is in flight to optimize away GUEST_PDTPR synchronziation for non-PAE guests. Skipping rarely accessed guest fields reduces the latency of a nested VM-Exit by ~200 cycles. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 140 +++++++++++++++++++++++++++++++++++++++------- arch/x86/kvm/vmx/vmx.h | 7 +++ 2 files changed, 127 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0d9a239b9df9..879f14fa384e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3376,20 +3376,57 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; } -/* - * Update the guest state fields of vmcs12 to reflect changes that - * occurred while L2 was running. (The "IA-32e mode guest" bit of the - * VM-entry controls is also updated, since this is really a guest - * state bit.) - */ -static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static bool is_vmcs12_ext_field(unsigned long field) { - vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); - vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + switch (field) { + case GUEST_ES_SELECTOR: + case GUEST_CS_SELECTOR: + case GUEST_SS_SELECTOR: + case GUEST_DS_SELECTOR: + case GUEST_FS_SELECTOR: + case GUEST_GS_SELECTOR: + case GUEST_LDTR_SELECTOR: + case GUEST_TR_SELECTOR: + case GUEST_ES_LIMIT: + case GUEST_CS_LIMIT: + case GUEST_SS_LIMIT: + case GUEST_DS_LIMIT: + case GUEST_FS_LIMIT: + case GUEST_GS_LIMIT: + case GUEST_LDTR_LIMIT: + case GUEST_TR_LIMIT: + case GUEST_GDTR_LIMIT: + case GUEST_IDTR_LIMIT: + case GUEST_ES_AR_BYTES: + case GUEST_DS_AR_BYTES: + case GUEST_FS_AR_BYTES: + case GUEST_GS_AR_BYTES: + case GUEST_LDTR_AR_BYTES: + case GUEST_TR_AR_BYTES: + case GUEST_ES_BASE: + case GUEST_CS_BASE: + case GUEST_SS_BASE: + case GUEST_DS_BASE: + case GUEST_FS_BASE: + case GUEST_GS_BASE: + case GUEST_LDTR_BASE: + case GUEST_TR_BASE: + case GUEST_GDTR_BASE: + case GUEST_IDTR_BASE: + case GUEST_PENDING_DBG_EXCEPTIONS: + case GUEST_BNDCFGS: + return true; + default: + break; + } - vmcs12->guest_rsp = kvm_rsp_read(vcpu); - vmcs12->guest_rip = kvm_rip_read(vcpu); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + return false; +} + +static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); @@ -3410,8 +3447,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); @@ -3427,11 +3462,65 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (kvm_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; +} + +static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) + return; + + + WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->nested.vmcs02; + vmx_vcpu_load(&vmx->vcpu, cpu); + + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_load(&vmx->vcpu, cpu); + put_cpu(); +} + +/* + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) + */ +static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->nested.hv_evmcs) + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; + + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_rsp = kvm_rsp_read(vcpu); + vmcs12->guest_rip = kvm_rip_read(vcpu); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; else @@ -3481,8 +3570,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); } /* @@ -4280,6 +4367,8 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) if (vmx->nested.current_vmptr == -1ull) return; + copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); + if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case they were modified */ @@ -4397,6 +4486,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + /* Read the field, zero-extended to a u64 field_value */ field_value = vmcs12_read_any(vmcs12, field, offset); @@ -4495,9 +4587,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - if (!is_guest_mode(vcpu)) + if (!is_guest_mode(vcpu)) { vmcs12 = get_vmcs12(vcpu); - else { + + /* + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties + * vmcs12, else we may crush a field or consume a stale value. + */ + if (!is_shadow_field_rw(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + } else { /* * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE * to shadowed-field sets the ALU flags for VMfailInvalid. @@ -5317,6 +5416,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, */ if (is_guest_mode(vcpu)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { if (vmx->nested.hv_evmcs) copy_enlightened_to_vmcs12(vmx); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f4448292df0f..f03af64e9934 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -109,6 +109,7 @@ struct nested_vmx { * to guest memory during VM exit. */ struct vmcs12 *cached_shadow_vmcs12; + /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. @@ -116,6 +117,12 @@ struct nested_vmx { bool need_vmcs12_to_shadow_sync; bool dirty_vmcs12; + /* + * Indicates lazily loaded guest state has not yet been decached from + * vmcs02. + */ + bool need_sync_vmcs02_to_vmcs12_rare; + /* * vmcs02 has been initialized, i.e. state that is constant for * vmcs02 has been written to the backing VMCS. Initialization -- cgit v1.2.3 From b1346ab2afbe64e7fb4ebc45efe5dd69367c9308 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 6 Jun 2019 17:24:00 +0200 Subject: KVM: nVMX: Rename prepare_vmcs02_*_full to prepare_vmcs02_*_rare These function do not prepare the entire state of the vmcs02, only the rarely needed parts. Rename them to make this clearer. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 879f14fa384e..6e82bbca2fe1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1955,7 +1955,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmx_set_constant_host_state(vmx); } -static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, +static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { prepare_vmcs02_constant_state(vmx); @@ -1976,7 +1976,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) - prepare_vmcs02_early_full(vmx, vmcs12); + prepare_vmcs02_early_rare(vmx, vmcs12); /* * PIN CONTROLS @@ -2130,7 +2130,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } } -static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; @@ -2254,7 +2254,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { - prepare_vmcs02_full(vmx, vmcs12); + prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; } -- cgit v1.2.3 From d28f4290b53a157191ed9991ad05dffe9e8c0c89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:27 -0700 Subject: KVM: VMX: Always signal #GP on WRMSR to MSR_IA32_CR_PAT with bad value The behavior of WRMSR is in no way dependent on whether or not KVM consumes the value. Fixes: 4566654bb9be9 ("KVM: vmx: Inject #GP on invalid PAT CR") Cc: stable@vger.kernel.org Cc: Nadav Amit Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1a87a91e98dc..091610684d28 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1894,9 +1894,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) MSR_TYPE_W); break; case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_pat_valid(data)) - return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; -- cgit v1.2.3 From 3b013a2972d5bc344d6eaa8f24fdfe268211e45f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:28 -0700 Subject: KVM: nVMX: Always sync GUEST_BNDCFGS when it comes from vmcs01 If L1 does not set VM_ENTRY_LOAD_BNDCFGS, then L1's BNDCFGS value must be propagated to vmcs02 since KVM always runs with VM_ENTRY_LOAD_BNDCFGS when MPX is supported. Because the value effectively comes from vmcs01, vmcs02 must be updated even if vmcs12 is clean. Fixes: 62cf9bd8118c4 ("KVM: nVMX: Fix emulation of VM_ENTRY_LOAD_BNDCFGS") Cc: stable@vger.kernel.org Cc: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6e82bbca2fe1..c4c0a45245b2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2228,13 +2228,9 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - if (kvm_mpx_supported()) { - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - else - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); - } + if (kvm_mpx_supported() && vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } /* @@ -2266,6 +2262,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the -- cgit v1.2.3 From c538d57f6726022f9810bfbb52c4696cfdaf10a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:29 -0700 Subject: KVM: nVMX: Write ENCLS-exiting bitmap once per vmcs02 KVM doesn't yet support SGX virtualization, i.e. writes a constant value to ENCLS_EXITING_BITMAP so that it can intercept ENCLS and inject a #UD. Fixes: 0b665d3040281 ("KVM: vmx: Inject #UD for SGX ENCLS instruction in guest") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c4c0a45245b2..7b6628edd617 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1943,6 +1943,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (enable_pml) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + /* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based @@ -2065,9 +2068,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) vmcs_write64(APIC_ACCESS_ADDR, -1ull); - if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -- cgit v1.2.3 From 4d6c989284ca61176f60148ba3f3c70650ee1aff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:30 -0700 Subject: KVM: nVMX: Don't rewrite GUEST_PML_INDEX during nested VM-Entry Emulation of GUEST_PML_INDEX for a nested VMM is a bit weird. Because L0 flushes the PML on every VM-Exit, the value in vmcs02 at the time of VM-Enter is a constant -1, regardless of what L1 thinks/wants. Fixes: 09abe32002665 ("KVM: nVMX: split pieces of prepare_vmcs02() to prepare_vmcs02_early()") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7b6628edd617..a090bfb10796 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1940,8 +1940,17 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); - if (enable_pml) + /* + * The PML address never changes, so it is constant in vmcs02. + * Conceptually we want to copy the PML index from vmcs01 here, + * and then back to vmcs01 on nested vmexit. But since we flush + * the log and reset GUEST_PML_INDEX on each vmexit, the PML + * index is also effectively constant in vmcs02. + */ + if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); @@ -2101,16 +2110,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control |= VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_init(vmx, exec_control); - /* - * Conceptually we want to copy the PML address and index from - * vmcs01 here, and then back to vmcs01 on nested vmexit. But, - * since we always flush the log on each vmexit and never change - * the PML address (once set), this happens to be equivalent to - * simply resetting the index in vmcs02. - */ - if (enable_pml) - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - /* * Interrupt/Exception Fields */ -- cgit v1.2.3 From b464f57e133d8c751ca1fb4af039c808b873876b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 7 Jun 2019 19:00:14 +0200 Subject: KVM: VMX: simplify vmx_prepare_switch_to_{guest,host} vmx->loaded_cpu_state can only be NULL or equal to vmx->loaded_vmcs, so change it to a bool. Because the direction of the bool is now the opposite of vmx->guest_msrs_dirty, change the direction of vmx->guest_msrs_dirty so that they match. Finally, do not imply that MSRs have to be reloaded when vmx->guest_state_loaded is false; instead, set vmx->guest_msrs_ready to false explicitly in vmx_prepare_switch_to_host. Cc: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 26 +++++++++++++------------- arch/x86/kvm/vmx/vmx.h | 18 ++++++++++++------ 2 files changed, 25 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 091610684d28..02c54f684ce5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1057,20 +1057,18 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ - if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { - vmx->guest_msrs_dirty = false; + if (!vmx->guest_msrs_ready) { + vmx->guest_msrs_ready = true; for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, vmx->guest_msrs[i].mask); } - - if (vmx->loaded_cpu_state) + if (vmx->guest_state_loaded) return; - vmx->loaded_cpu_state = vmx->loaded_vmcs; - host_state = &vmx->loaded_cpu_state->host_state; + host_state = &vmx->loaded_vmcs->host_state; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not @@ -1126,20 +1124,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmcs_writel(HOST_GS_BASE, gs_base); host_state->gs_base = gs_base; } + + vmx->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->loaded_cpu_state) + if (!vmx->guest_state_loaded) return; - WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); - host_state = &vmx->loaded_cpu_state->host_state; + host_state = &vmx->loaded_vmcs->host_state; ++vmx->vcpu.stat.host_state_reload; - vmx->loaded_cpu_state = NULL; #ifdef CONFIG_X86_64 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -1165,13 +1163,15 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); + vmx->guest_state_loaded = false; + vmx->guest_msrs_ready = false; } #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->loaded_cpu_state) + if (vmx->guest_state_loaded) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; @@ -1180,7 +1180,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->loaded_cpu_state) + if (vmx->guest_state_loaded) wrmsrl(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; @@ -1583,7 +1583,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_dirty = true; + vmx->guest_msrs_ready = false; if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f03af64e9934..4c5c24f37c5f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -187,13 +187,23 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; u8 msr_bitmap_mode; + + /* + * If true, host state has been stored in vmx->loaded_vmcs for + * the CPU registers that only need to be switched when transitioning + * to/from the kernel, and the registers have been loaded with guest + * values. If false, host state is loaded in the CPU registers + * and vmx->loaded_vmcs->host_state is invalid. + */ + bool guest_state_loaded; + u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; - bool guest_msrs_dirty; + bool guest_msrs_ready; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -208,14 +218,10 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. loaded_cpu_state points - * to the VMCS whose state is loaded into the CPU registers that only - * need to be switched when transitioning to/from the kernel; a NULL - * value indicates that host state is loaded. + * guest (L2), it points to a different VMCS. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; - struct loaded_vmcs *loaded_cpu_state; struct msr_autoload { struct vmx_msrs guest; -- cgit v1.2.3 From 13b964a29d66333c1957dbd51f2c9e138c546f7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:31 -0700 Subject: KVM: nVMX: Don't "put" vCPU or host state when switching VMCS When switching between vmcs01 and vmcs02, KVM isn't actually switching between guest and host. If guest state is already loaded (the likely, if not guaranteed, case), keep the guest state loaded and manually swap the loaded_cpu_state pointer after propagating saved host state to the new vmcs0{1,2}. Avoiding the switch between guest and host reduces the latency of switching between vmcs01 and vmcs02 by several hundred cycles, and reduces the roundtrip time of a nested VM by upwards of 1000 cycles. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 23 +++++++++++++++++++- arch/x86/kvm/vmx/vmx.c | 53 ++++++++++++++++++++++++++--------------------- arch/x86/kvm/vmx/vmx.h | 3 ++- 3 files changed, 53 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a090bfb10796..856ac90ce10a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -248,18 +248,39 @@ static void free_nested(struct kvm_vcpu *vcpu) free_loaded_vmcs(&vmx->nested.vmcs02); } +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *prev; int cpu; if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx_vcpu_put(vcpu); + prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); + vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); vm_entry_controls_reset_shadow(vmx); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 02c54f684ce5..26b3d96ef5a5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1039,6 +1039,33 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) +{ + if (unlikely(fs_sel != host->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host->gs_sel = gs_sel; + } + if (unlikely(fs_base != host->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host->fs_base = fs_base; + } + if (unlikely(gs_base != host->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host->gs_base = gs_base; + } +} + void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1102,29 +1129,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - if (unlikely(fs_sel != host_state->fs_sel)) { - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->fs_sel = fs_sel; - } - if (unlikely(gs_sel != host_state->gs_sel)) { - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - } - if (unlikely(fs_base != host_state->fs_base)) { - vmcs_writel(HOST_FS_BASE, fs_base); - host_state->fs_base = fs_base; - } - if (unlikely(gs_base != host_state->gs_base)) { - vmcs_writel(HOST_GS_BASE, gs_base); - host_state->gs_base = gs_base; - } - + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); vmx->guest_state_loaded = true; } @@ -1314,7 +1319,7 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) pi_set_sn(pi_desc); } -void vmx_vcpu_put(struct kvm_vcpu *vcpu) +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4c5c24f37c5f..0bd598c47aec 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -303,11 +303,12 @@ struct kvm_vmx { bool nested_vmx_allowed(struct kvm_vcpu *vcpu); void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void vmx_vcpu_put(struct kvm_vcpu *vcpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -- cgit v1.2.3 From 8ef863e67a89c72b5af893a1214f6c35a5f456f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:32 -0700 Subject: KVM: nVMX: Don't reread VMCS-agnostic state when switching VMCS When switching between vmcs01 and vmcs02, there is no need to update state tracking for values that aren't tied to any particular VMCS as the per-vCPU values are already up-to-date (vmx_switch_vmcs() can only be called when the vCPU is loaded). Avoiding the update eliminates a RDMSR, and potentially a RDPKRU and posted-interrupt update (cmpxchg64() and more). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 18 +++++++++++++----- arch/x86/kvm/vmx/vmx.h | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 856ac90ce10a..691f84bff051 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -279,7 +279,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load(vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 26b3d96ef5a5..7ffcbb674b1c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1234,11 +1234,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) pi_set_on(pi_desc); } -/* - * Switches to specified vcpu, until a matching vcpu_put(), but assumes - * vcpu mutex is already taken. - */ -void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1299,8 +1295,20 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_has_tsc_control && vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) decache_tsc_multiplier(vmx); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); vmx->host_debugctlmsr = get_debugctlmsr(); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 0bd598c47aec..613f272cdcf8 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -302,6 +302,7 @@ struct kvm_vmx { }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); -- cgit v1.2.3 From 73cb85568433feadb79e963bf2efba9b3e9ae3df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:26 -0700 Subject: KVM: nVMX: Don't dump VMCS if virtual APIC page can't be mapped ... as a malicious userspace can run a toy guest to generate invalid virtual-APIC page addresses in L1, i.e. flood the kernel log with error messages. Fixes: 690908104e39d ("KVM: nVMX: allow tests to use bad virtual-APIC page address") Cc: stable@vger.kernel.org Cc: Paolo Bonzini Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 691f84bff051..4ba0e7b60253 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2880,9 +2880,6 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) */ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_TPR_SHADOW); - } else { - printk("bad virtual-APIC page address\n"); - dump_vmcs(); } } -- cgit v1.2.3 From ca2f5466f854184f1753ecb526f0e21a353a25e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:33 -0700 Subject: KVM: nVMX: Don't speculatively write virtual-APIC page address The VIRTUAL_APIC_PAGE_ADDR in vmcs02 is guaranteed to be updated before it is consumed by hardware, either in nested_vmx_enter_non_root_mode() or via the KVM_REQ_GET_VMCS12_PAGES callback. Avoid an extra VMWRITE and only stuff a bad value into vmcs02 when mapping vmcs12's address fails. This also eliminates the need for extra comments to connect the dots between prepare_vmcs02_early() and nested_get_vmcs12_pages(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4ba0e7b60253..ded024738db4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2039,20 +2039,13 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; - /* - * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if - * nested_get_vmcs12_pages can't fix it up, the illegal value - * will result in a VM entry failure. - */ - if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); - } else { #ifdef CONFIG_X86_64 + else exec_control |= CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING; #endif - } /* * A vmexit (to either L1 hypervisor or L0 userspace) is always needed @@ -2861,10 +2854,6 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { map = &vmx->nested.virtual_apic_map; - /* - * If translation failed, VM entry will fail because - * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - */ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && @@ -2880,6 +2869,12 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) */ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_TPR_SHADOW); + } else { + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to + * force VM-Entry to fail. + */ + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); } } -- cgit v1.2.3 From a49700b66e3523947f0e19c761891cc7c510d9fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:34 -0700 Subject: KVM: nVMX: Don't speculatively write APIC-access page address If nested_get_vmcs12_pages() fails to map L1's APIC_ACCESS_ADDR into L2, then it disables SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES in vmcs02. In other words, the APIC_ACCESS_ADDR in vmcs02 is guaranteed to be written with the correct value before being consumed by hardware, drop the unneessary VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ded024738db4..fb702fa83e06 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2083,14 +2083,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); - /* - * Write an illegal value to APIC_ACCESS_ADDR. Later, - * nested_get_vmcs12_pages will either fix it up or - * remove the VM execution control. - */ - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) - vmcs_write64(APIC_ACCESS_ADDR, -1ull); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -- cgit v1.2.3 From 142e4be77bc629802599d7c94f413759bca1c185 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:35 -0700 Subject: KVM: nVMX: Update vmcs12 for MSR_IA32_CR_PAT when it's written As alluded to by the TODO comment, KVM unconditionally intercepts writes to the PAT MSR. In the unlikely event that L1 allows L2 to write L1's PAT directly but saves L2's PAT on VM-Exit, update vmcs12 when L2 writes the PAT. This eliminates the need to VMREAD the value from vmcs02 on VM-Exit as vmcs12 is already up to date in all situations. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ---- arch/x86/kvm/vmx/vmx.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fb702fa83e06..3290e332c25f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3564,10 +3564,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); } - /* TODO: These cannot have changed unless we have MSR bitmaps and - * the relevant bit asks not to trap the change */ - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7ffcbb674b1c..23dd23d7023f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1910,6 +1910,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!kvm_pat_valid(data)) return 1; + if (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) + get_vmcs12(vcpu)->guest_ia32_pat = data; + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; -- cgit v1.2.3 From de70d279709efb7d0b0c0dce2eb398fc43c4b7f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:36 -0700 Subject: KVM: nVMX: Update vmcs12 for SYSENTER MSRs when they're written For L2, KVM always intercepts WRMSR to SYSENTER MSRs. Update vmcs12 in the WRMSR handler so that they don't need to be (re)read from vmcs02 on every nested VM-Exit. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 ++++--- arch/x86/kvm/vmx/vmx.c | 6 ++++++ 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3290e332c25f..c096e803e6f0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3521,6 +3521,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -3566,9 +3570,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); } /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 23dd23d7023f..248fca1f0c45 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1831,12 +1831,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif case MSR_IA32_SYSENTER_CS: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_cs = data; vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_eip = data; vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_esp = data; vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: -- cgit v1.2.3 From 699a1ac214328569d203b9c6bfc22903d5642dee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:37 -0700 Subject: KVM: nVMX: Update vmcs12 for MSR_IA32_DEBUGCTLMSR when it's written KVM unconditionally intercepts WRMSR to MSR_IA32_DEBUGCTLMSR. In the unlikely event that L1 allows L2 to write L1's MSR_IA32_DEBUGCTLMSR, but but saves L2's value on VM-Exit, update vmcs12 during L2's WRMSR so as to eliminate the need to VMREAD the value from vmcs02 on nested VM-Exit. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 8 ++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c096e803e6f0..ba7f16b37719 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3563,10 +3563,8 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - } if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 248fca1f0c45..6e414d698130 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1845,6 +1845,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) get_vmcs12(vcpu)->guest_sysenter_esp = data; vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_DEBUGCTLMSR: + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; + + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && -- cgit v1.2.3 From c27e5b0d139b8728abd6b087cb0d3b2eae6ab079 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:39 -0700 Subject: KVM: nVMX: Don't update GUEST_BNDCFGS if it's clean in HV eVMCS L1 is responsible for dirtying GUEST_GRP1 if it writes GUEST_BNDCFGS. Cc: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba7f16b37719..3363fe1e743b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2197,6 +2197,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } + + if (kvm_mpx_supported() && vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } if (nested_cpu_has_xsaves(vmcs12)) @@ -2232,10 +2236,6 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); set_cr4_guest_host_mask(vmx); - - if (kvm_mpx_supported() && vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } /* -- cgit v1.2.3 From bf03d4f9334728bf7c8ffc7de787df48abd6340e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 6 Jun 2019 18:52:44 +0200 Subject: KVM: x86: introduce is_pae_paging Checking for 32-bit PAE is quite common around code that fiddles with the PDPTRs. Add a function to compress all checks into a single invocation. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 8 ++++---- arch/x86/kvm/x86.h | 5 +++++ 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3363fe1e743b..524a13f91589 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -961,8 +961,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && - !nested_ept) { + if (is_pae_paging(vcpu) && !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6e414d698130..a74936a5cf1a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2767,7 +2767,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty)) return; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); @@ -2779,7 +2779,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (is_pae_paging(vcpu)) { mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 848ca0335b59..e536a2b2b0e8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -719,7 +719,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) + if (!is_pae_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -962,8 +962,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu) && (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))) return 1; - else if (is_pae(vcpu) && is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + else if (is_pae_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); @@ -8596,7 +8596,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { + if (is_pae_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 275b3b646023..e08a12892e8b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -139,6 +139,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } +static inline bool is_pae_paging(struct kvm_vcpu *vcpu) +{ + return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); +} + static inline u32 bit(int bitno) { return 1 << (bitno & 31); -- cgit v1.2.3 From c7554efc83355150c91d8097f26a3c99d58ad53d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 09:06:40 -0700 Subject: KVM: nVMX: Copy PDPTRs to/from vmcs12 only when necessary Per Intel's SDM: ... the logical processor uses PAE paging if CR0.PG=1, CR4.PAE=1 and IA32_EFER.LME=0. A VM entry to a guest that uses PAE paging loads the PDPTEs into internal, non-architectural registers based on the setting of the "enable EPT" VM-execution control. and: [GUEST_PDPTR] values are saved into the four PDPTE fields as follows: - If the "enable EPT" VM-execution control is 0 or the logical processor was not using PAE paging at the time of the VM exit, the values saved are undefined. In other words, if EPT is disabled or the guest isn't using PAE paging, then the PDPTRS aren't consumed by hardware on VM-Entry and are loaded with junk on VM-Exit. From a nesting perspective, all of the above hold true, i.e. KVM can effectively ignore the VMCS PDPTRs. E.g. KVM already loads the PDPTRs from memory when nested EPT is disabled (see nested_vmx_load_cr3()). Because KVM intercepts setting CR4.PAE, there is no danger of consuming a stale value or crushing L1's VMWRITEs regardless of whether L1 intercepts CR4.PAE. The vmcs12's values are unchanged up until the VM-Exit where L2 sets CR4.PAE, i.e. L0 will see the new PAE state on the subsequent VM-Entry and propagate the PDPTRs from vmcs12 to vmcs02. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 524a13f91589..3cb257b4d8a2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2252,10 +2252,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { + if (vmx->nested.dirty_vmcs12 || hv_evmcs) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; + + load_guest_pdptrs_vmcs12 = !hv_evmcs || + !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && @@ -2358,6 +2364,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, entry_failure_code)) return -EINVAL; + /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ + if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && + is_pae_paging(vcpu)) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -3547,10 +3562,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ if (enable_ept) { vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); - vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); - vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); - vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); - vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } } vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); -- cgit v1.2.3 From c075c3e49d7ae3599106f1af53352268030469db Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:53 -0700 Subject: KVM: nVMX: Use adjusted pin controls for vmcs02 KVM provides a module parameter to allow disabling virtual NMI support to simplify testing (hardware *without* virtual NMI support is hard to come by but it does have users). When preparing vmcs02, use the accessor for pin controls to ensure that the module param is respected for nested guests. Opportunistically swap the order of applying L0's and L1's pin controls to better align with other controls and to prepare for a future patche that will ignore L1's, but not L0's, preemption timer flag. Fixes: d02fcf50779ec ("kvm: vmx: Allow disabling virtual NMI support") Cc: Paolo Bonzini Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 ++--- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3cb257b4d8a2..cd33f3109f24 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2013,10 +2013,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * PIN CONTROLS */ - exec_control = vmcs12->pin_based_vm_exec_control; - + exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control |= vmcs12->pin_based_vm_exec_control; /* Preemption timer setting is computed directly in vmx_vcpu_run. */ - exec_control |= vmcs_config.pin_based_exec_ctrl; exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; vmx->loaded_vmcs->hv_timer_armed = false; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a74936a5cf1a..0e722fa5e189 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3825,7 +3825,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } -static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 613f272cdcf8..7f67f327204a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -480,6 +480,7 @@ static inline u32 vmx_vmexit_ctrl(void) } u32 vmx_exec_control(struct vcpu_vmx *vmx); +u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { -- cgit v1.2.3 From 70f932ecdfe6b593ef6784d55d2c096aafac1510 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:54 -0700 Subject: KVM: VMX: Add builder macros for shadowing controls ... to pave the way for shadowing all (five) major VMCS control fields without massive amounts of error prone copy+paste+modify. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 100 ++++++++++++++++++------------------------------- 1 file changed, 36 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 7f67f327204a..db4f9289d5da 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -85,6 +85,11 @@ struct pt_desc { struct pt_ctx guest; }; +struct vmx_controls_shadow { + u32 vm_entry; + u32 vm_exit; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -200,6 +205,9 @@ struct vcpu_vmx { u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; + + struct vmx_controls_shadow controls_shadow; + struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; @@ -211,8 +219,6 @@ struct vcpu_vmx { u64 spec_ctrl; - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; u32 secondary_exec_control; /* @@ -388,69 +394,35 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); -} - -static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_ENTRY_CONTROLS, val); - vmx->vm_entry_controls_shadow = val; -} - -static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_entry_controls_shadow != val) - vm_entry_controls_init(vmx, val); -} - -static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_entry_controls_shadow; -} - -static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); -} - -static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); -} - -static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); -} - -static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_EXIT_CONTROLS, val); - vmx->vm_exit_controls_shadow = val; -} - -static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_exit_controls_shadow != val) - vm_exit_controls_init(vmx, val); -} - -static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_exit_controls_shadow; -} - -static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); -} - -static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +#define BUILD_CONTROLS_SHADOW(lname, uname) \ +static inline void lname##_controls_reset_shadow(struct vcpu_vmx *vmx) \ +{ \ + vmx->controls_shadow.lname = vmcs_read32(uname); \ +} \ +static inline void lname##_controls_init(struct vcpu_vmx *vmx, u32 val) \ +{ \ + vmcs_write32(uname, val); \ + vmx->controls_shadow.lname = val; \ +} \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ +{ \ + if (vmx->controls_shadow.lname != val) \ + lname##_controls_init(vmx, val); \ +} \ +static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return vmx->controls_shadow.lname; \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { -- cgit v1.2.3 From c5f2c76643b612ffa47e4660c8f44deba619b068 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:55 -0700 Subject: KVM: VMX: Shadow VMCS pin controls Prepare to shadow all major control fields on a per-VMCS basis, which allows KVM to avoid costly VMWRITEs when switching between vmcs01 and vmcs02. Shadowing pin controls also allows a future patch to remove the per-VMCS 'hv_timer_armed' flag, as the shadow copy is a superset of said flag. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 10 ++++------ arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index cd33f3109f24..8617d305fbbd 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -285,6 +285,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); + pin_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); } @@ -2026,7 +2027,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } else { exec_control &= ~PIN_BASED_POSTED_INTR; } - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); + pin_controls_init(vmx, exec_control); /* * EXEC CONTROLS diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e722fa5e189..a709afb04c28 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3844,7 +3844,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, @@ -4042,7 +4042,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + pin_controls_init(vmx, vmx_pin_based_exec_ctrl(vmx)); vmx->hv_deadline_tsc = -1; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -6366,8 +6366,7 @@ static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); if (!vmx->loaded_vmcs->hv_timer_armed) - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + pin_controls_setbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); vmx->loaded_vmcs->hv_timer_armed = true; } @@ -6396,8 +6395,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) } if (vmx->loaded_vmcs->hv_timer_armed) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + pin_controls_clearbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); vmx->loaded_vmcs->hv_timer_armed = false; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index db4f9289d5da..1cdcaf8a6d97 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -88,6 +88,7 @@ struct pt_desc { struct vmx_controls_shadow { u32 vm_entry; u32 vm_exit; + u32 pin; }; /* @@ -423,6 +424,7 @@ static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ } BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { -- cgit v1.2.3 From 2183f5645ae7e074ed1777f3de9a782dd23db248 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:56 -0700 Subject: KVM: VMX: Shadow VMCS primary execution controls Prepare to shadow all major control fields on a per-VMCS basis, which allows KVM to avoid VMREADs when switching between vmcs01 and vmcs02, and more importantly can eliminate costly VMWRITEs to controls when preparing vmcs02. Shadowing exec controls also saves a VMREAD when opening virtual INTR/NMI windows, yay... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 14 ++++++-------- arch/x86/kvm/vmx/vmx.c | 38 +++++++++++++++----------------------- arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 23 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8617d305fbbd..3f80df186476 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -286,6 +286,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); pin_controls_reset_shadow(vmx); + exec_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); } @@ -2052,7 +2053,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) */ exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + exec_controls_init(vmx, exec_control); /* * SECONDARY EXEC CONTROLS @@ -2873,8 +2874,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * _not_ what the processor does but it's basically the * only possibility we have. */ - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_TPR_SHADOW); + exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); } else { /* * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to @@ -2896,11 +2896,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); + exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); + exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); } /* @@ -2953,7 +2951,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) u32 exit_reason = EXIT_REASON_INVALID_STATE; u32 exit_qual; - evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a709afb04c28..a2c6f63d2adc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2796,22 +2796,20 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) vmx_decache_cr3(vcpu); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | - (CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); + exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); + exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } @@ -4045,7 +4043,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) pin_controls_init(vmx, vmx_pin_based_exec_ctrl(vmx)); vmx->hv_deadline_tsc = -1; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); + exec_controls_init(vmx, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); @@ -4235,8 +4233,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void enable_irq_window(struct kvm_vcpu *vcpu) { - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -4247,8 +4244,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4795,8 +4791,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_MOV_DR_EXITING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers @@ -4840,7 +4835,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -4900,8 +4895,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5155,8 +5149,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!enable_vnmi); - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5168,7 +5161,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; - u32 cpu_exec_ctrl; bool intr_window_requested; unsigned count = 130; @@ -5179,8 +5171,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; + intr_window_requested = exec_controls_get(vmx) & + CPU_BASED_VIRTUAL_INTR_PENDING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1cdcaf8a6d97..754cc52f2640 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -89,6 +89,7 @@ struct vmx_controls_shadow { u32 vm_entry; u32 vm_exit; u32 pin; + u32 exec; }; /* @@ -425,6 +426,7 @@ static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { -- cgit v1.2.3 From fe7f895dae4fa5725c0459b316b3e9ee814d2583 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:57 -0700 Subject: KVM: VMX: Shadow VMCS secondary execution controls Prepare to shadow all major control fields on a per-VMCS basis, which allows KVM to avoid costly VMWRITEs when switching between vmcs01 and vmcs02. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 40 ++++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 28 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3f80df186476..21cb6cd88765 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -192,7 +192,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } @@ -287,6 +287,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vm_exit_controls_reset_shadow(vmx); pin_controls_reset_shadow(vmx); exec_controls_reset_shadow(vmx); + secondary_exec_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); } @@ -2083,7 +2084,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + secondary_exec_controls_init(vmx, exec_control); } /* @@ -2853,8 +2854,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); } } @@ -4667,8 +4668,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) { vmx->nested.current_vmptr = vmptr; if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.need_vmcs12_to_shadow_sync = true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a2c6f63d2adc..8c1cbc19af97 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2909,6 +2909,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { + struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control @@ -2919,20 +2920,19 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; - else if (to_vmx(vcpu)->rmode.vm86_active) + else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { if (cr4 & X86_CR4_UMIP) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); + } } if (cr4 & X86_CR4_VMXE) { @@ -2947,7 +2947,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) + if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -3565,7 +3565,7 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) u8 mode = 0; if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + (secondary_exec_controls_get(to_vmx(vcpu)) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { mode |= MSR_BITMAP_MODE_X2APIC; if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) @@ -3845,11 +3845,11 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); else - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } @@ -4047,8 +4047,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - vmx->secondary_exec_control); + secondary_exec_controls_init(vmx, vmx->secondary_exec_control); } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { @@ -5969,6 +5968,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 sec_exec_control; if (!lapic_in_kernel(vcpu)) @@ -5980,11 +5980,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; + vmx->nested.change_vmcs01_virtual_apic_mode = true; return; } - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -6006,7 +6006,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; break; } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + secondary_exec_controls_set(vmx, sec_exec_control); vmx_update_msr_bitmap(vcpu); } @@ -6827,7 +6827,7 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } -static void vmcs_set_secondary_exec_control(u32 new_ctl) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) { /* * These bits in the secondary execution controls field @@ -6841,10 +6841,10 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + u32 new_ctl = vmx->secondary_exec_control; + u32 cur_ctl = secondary_exec_controls_get(vmx); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - (new_ctl & ~mask) | (cur_ctl & mask)); + secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); } /* @@ -6982,7 +6982,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx->secondary_exec_control); + vmcs_set_secondary_exec_control(vmx); } if (nested_vmx_allowed(vcpu)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 754cc52f2640..bde6c43eea16 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -90,6 +90,7 @@ struct vmx_controls_shadow { u32 vm_exit; u32 pin; u32 exec; + u32 secondary_exec; }; /* @@ -427,6 +428,7 @@ BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { -- cgit v1.2.3 From 09e226cf07e6bf85d885afa43fabc02b88dc1652 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:58 -0700 Subject: KVM: nVMX: Shadow VMCS controls on a per-VMCS basis ... to pave the way for not preserving the shadow copies across switches between vmcs01 and vmcs02, and eventually to avoid VMWRITEs to vmcs02 when the desired value is unchanged across nested VM-Enters. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs.h | 9 +++++++++ arch/x86/kvm/vmx/vmx.h | 22 +++++++--------------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 971a46c69df4..52f12d78e4fa 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -42,6 +42,14 @@ struct vmcs_host_state { #endif }; +struct vmcs_controls_shadow { + u32 vm_entry; + u32 vm_exit; + u32 pin; + u32 exec; + u32 secondary_exec; +}; + /* * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs @@ -61,6 +69,7 @@ struct loaded_vmcs { unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; struct vmcs_host_state host_state; + struct vmcs_controls_shadow controls_shadow; }; static inline bool is_exception_n(u32 intr_info, u8 vector) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bde6c43eea16..ec11ecf6f040 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -85,14 +85,6 @@ struct pt_desc { struct pt_ctx guest; }; -struct vmx_controls_shadow { - u32 vm_entry; - u32 vm_exit; - u32 pin; - u32 exec; - u32 secondary_exec; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -209,8 +201,6 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct vmx_controls_shadow controls_shadow; - struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; @@ -400,21 +390,23 @@ static inline u8 vmx_get_rvi(void) #define BUILD_CONTROLS_SHADOW(lname, uname) \ static inline void lname##_controls_reset_shadow(struct vcpu_vmx *vmx) \ { \ - vmx->controls_shadow.lname = vmcs_read32(uname); \ + vmx->loaded_vmcs->controls_shadow.lname = vmcs_read32(uname); \ } \ static inline void lname##_controls_init(struct vcpu_vmx *vmx, u32 val) \ { \ vmcs_write32(uname, val); \ - vmx->controls_shadow.lname = val; \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ { \ - if (vmx->controls_shadow.lname != val) \ - lname##_controls_init(vmx, val); \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write32(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ } \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->controls_shadow.lname; \ + return vmx->loaded_vmcs->controls_shadow.lname; \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ -- cgit v1.2.3 From ae81d08993cbc515e3181ee6bebce5cd878133f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:17:59 -0700 Subject: KVM: nVMX: Don't reset VMCS controls shadow on VMCS switch ... now that the shadow copies are per-VMCS. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 ----- arch/x86/kvm/vmx/vmx.h | 4 ---- 2 files changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 21cb6cd88765..d4f529a2e194 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -283,11 +283,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vm_entry_controls_reset_shadow(vmx); - vm_exit_controls_reset_shadow(vmx); - pin_controls_reset_shadow(vmx); - exec_controls_reset_shadow(vmx); - secondary_exec_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ec11ecf6f040..52d7bc90d9ef 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -388,10 +388,6 @@ static inline u8 vmx_get_rvi(void) } #define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_reset_shadow(struct vcpu_vmx *vmx) \ -{ \ - vmx->loaded_vmcs->controls_shadow.lname = vmcs_read32(uname); \ -} \ static inline void lname##_controls_init(struct vcpu_vmx *vmx, u32 val) \ { \ vmcs_write32(uname, val); \ -- cgit v1.2.3 From 3af80fec6e7fe2e89aa131a0ebdb90be780668f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:18:00 -0700 Subject: KVM: VMX: Explicitly initialize controls shadow at VMCS allocation Or: Don't re-initialize vmcs02's controls on every nested VM-Entry. VMWRITEs to the major VMCS controls are deceptively expensive. Intel CPUs with VMCS caching (Westmere and later) also optimize away consistency checks on VM-Entry, i.e. skip consistency checks if the relevant fields have not changed since the last successful VM-Entry (of the cached VMCS). Because uops are a precious commodity, uCode's dirty VMCS field tracking isn't as precise as software would prefer. Notably, writing any of the major VMCS fields effectively marks the entire VMCS dirty, i.e. causes the next VM-Entry to perform all consistency checks, which consumes several hundred cycles. Zero out the controls' shadow copies during VMCS allocation and use the optimized setter when "initializing" controls. While this technically affects both non-nested and nested virtualization, nested virtualization is the primary beneficiary as avoid VMWRITEs when prepare vmcs02 allows hardware to optimizie away consistency checks. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 10 +++++----- arch/x86/kvm/vmx/vmx.c | 12 +++++++----- arch/x86/kvm/vmx/vmx.h | 5 ----- 3 files changed, 12 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d4f529a2e194..3f76a1f3fe3c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2024,7 +2024,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } else { exec_control &= ~PIN_BASED_POSTED_INTR; } - pin_controls_init(vmx, exec_control); + pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS @@ -2049,7 +2049,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) */ exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; - exec_controls_init(vmx, exec_control); + exec_controls_set(vmx, exec_control); /* * SECONDARY EXEC CONTROLS @@ -2079,7 +2079,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); - secondary_exec_controls_init(vmx, exec_control); + secondary_exec_controls_set(vmx, exec_control); } /* @@ -2098,7 +2098,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (guest_efer != host_efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } - vm_entry_controls_init(vmx, exec_control); + vm_entry_controls_set(vmx, exec_control); /* * EXIT CONTROLS @@ -2110,7 +2110,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control = vmx_vmexit_ctrl(); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; - vm_exit_controls_init(vmx, exec_control); + vm_exit_controls_set(vmx, exec_control); /* * Interrupt/Exception Fields diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8c1cbc19af97..bae376bf9c20 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2485,6 +2485,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + memset(&loaded_vmcs->controls_shadow, 0, + sizeof(struct vmcs_controls_shadow)); return 0; @@ -4040,14 +4042,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - pin_controls_init(vmx, vmx_pin_based_exec_ctrl(vmx)); + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); vmx->hv_deadline_tsc = -1; - exec_controls_init(vmx, vmx_exec_control(vmx)); + exec_controls_set(vmx, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); - secondary_exec_controls_init(vmx, vmx->secondary_exec_control); + secondary_exec_controls_set(vmx, vmx->secondary_exec_control); } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { @@ -4105,10 +4107,10 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vm_exit_controls_init(vmx, vmx_vmexit_ctrl()); + vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ - vm_entry_controls_init(vmx, vmx_vmentry_ctrl()); + vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 52d7bc90d9ef..82d0bc3a4d52 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -388,11 +388,6 @@ static inline u8 vmx_get_rvi(void) } #define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_init(struct vcpu_vmx *vmx, u32 val) \ -{ \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ -} \ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ { \ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ -- cgit v1.2.3 From de0286b7884a6a3309e299dda876810faa281547 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:18:01 -0700 Subject: KVM: nVMX: Preserve last USE_MSR_BITMAPS when preparing vmcs02 KVM dynamically toggles the CPU_BASED_USE_MSR_BITMAPS execution control for nested guests based on whether or not both L0 and L1 want to pass through the same MSRs to L2. Preserve the last used value from vmcs02 so as to avoid multiple VMWRITEs to (re)set/(re)clear the bit on nested VM-Entry. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3f76a1f3fe3c..54b7726c5bf5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2047,8 +2047,18 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * A vmexit (to either L1 hypervisor or L0 userspace) is always needed * for I/O port accesses. */ - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + + /* + * This bit will be computed in nested_get_vmcs12_pages, because + * we do not have access to L1's MSR bitmap yet. For now, keep + * the same bit as before, hoping to avoid multiple VMWRITEs that + * only set/clear this bit. + */ + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; + exec_controls_set(vmx, exec_control); /* -- cgit v1.2.3 From 469debdb8be5814c71c146ce8e21ae7363ae644d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:18:02 -0700 Subject: KVM: nVMX: Preset *DT exiting in vmcs02 when emulating UMIP KVM dynamically toggles SECONDARY_EXEC_DESC to intercept (a subset of) instructions that are subject to User-Mode Instruction Prevention, i.e. VMCS.SECONDARY_EXEC_DESC == CR4.UMIP when emulating UMIP. Preset the VMCS control when preparing vmcs02 to avoid unnecessarily VMWRITEs, e.g. KVM will clear VMCS.SECONDARY_EXEC_DESC in prepare_vmcs02_early() and then set it in vmx_set_cr4(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 54b7726c5bf5..23c310adea7d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2085,6 +2085,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* + * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() + * will not have to rewrite the controls just for this bit. + */ + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && + (vmcs12->guest_cr4 & X86_CR4_UMIP)) + exec_control |= SECONDARY_EXEC_DESC; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); -- cgit v1.2.3 From 9d99cc49a483d4e73fbc1b5ec27621d67e773ecd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:18:03 -0700 Subject: KVM: VMX: Drop hv_timer_armed from 'struct loaded_vmcs' ... now that it is fully redundant with the pin controls shadow. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 1 - arch/x86/kvm/vmx/vmcs.h | 1 - arch/x86/kvm/vmx/vmx.c | 8 ++------ 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23c310adea7d..22e02f10ece9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2015,7 +2015,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control |= vmcs12->pin_based_vm_exec_control; /* Preemption timer setting is computed directly in vmx_vcpu_run. */ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - vmx->loaded_vmcs->hv_timer_armed = false; /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 52f12d78e4fa..9a87a2482e3e 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -61,7 +61,6 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - bool hv_timer_armed; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bae376bf9c20..09f1e0b70316 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6359,9 +6359,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); - if (!vmx->loaded_vmcs->hv_timer_armed) - pin_controls_setbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = true; + pin_controls_setbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); } static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) @@ -6388,9 +6386,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) return; } - if (vmx->loaded_vmcs->hv_timer_armed) - pin_controls_clearbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = false; + pin_controls_clearbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); } void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) -- cgit v1.2.3 From 804939ea200d421fb7f3bf9eefebc38c255dd624 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 May 2019 12:18:05 -0700 Subject: KVM: VMX: Leave preemption timer running when it's disabled VMWRITEs to the major VMCS controls, pin controls included, are deceptively expensive. CPUs with VMCS caching (Westmere and later) also optimize away consistency checks on VM-Entry, i.e. skip consistency checks if the relevant fields have not changed since the last successful VM-Entry (of the cached VMCS). Because uops are a precious commodity, uCode's dirty VMCS field tracking isn't as precise as software would prefer. Notably, writing any of the major VMCS fields effectively marks the entire VMCS dirty, i.e. causes the next VM-Entry to perform all consistency checks, which consumes several hundred cycles. As it pertains to KVM, toggling PIN_BASED_VMX_PREEMPTION_TIMER more than doubles the latency of the next VM-Entry (and again when/if the flag is toggled back). In a non-nested scenario, running a "standard" guest with the preemption timer enabled, toggling the timer flag is uncommon but not rare, e.g. roughly 1 in 10 entries. Disabling the preemption timer can change these numbers due to its use for "immediate exits", even when explicitly disabled by userspace. Nested virtualization in particular is painful, as the timer flag is set for the majority of VM-Enters, but prepare_vmcs02() initializes vmcs02's pin controls to *clear* the flag since its the timer's final state isn't known until vmx_vcpu_run(). I.e. the majority of nested VM-Enters end up unnecessarily writing pin controls *twice*. Rather than toggle the timer flag in pin controls, set the timer value itself to the largest allowed value to put it into a "soft disabled" state, and ignore any spurious preemption timer exits. Sadly, the timer is a 32-bit value and so theoretically it can fire before the head death of the universe, i.e. spurious exits are possible. But because KVM does *not* save the timer value on VM-Exit and because the timer runs at a slower rate than the TSC, the maximuma timer value is still sufficiently large for KVM's purposes. E.g. on a modern CPU with a timer that runs at 1/32 the frequency of a 2.4ghz constant-rate TSC, the timer will fire after ~55 seconds of *uninterrupted* guest execution. In other words, spurious VM-Exits are effectively only possible if the host is completely tickless on the logical CPU, the guest is not using the preemption timer, and the guest is not generating VM-Exits for any other reason. To be safe from bad/weird hardware, disable the preemption timer if its maximum delay is less than ten seconds. Ten seconds is mostly arbitrary and was selected in no small part because it's a nice round number. For simplicity and paranoia, fall back to __kvm_request_immediate_exit() if the preemption timer is disabled by KVM or userspace. Previously KVM continued to use the preemption timer to force immediate exits even when the timer was disabled by userspace. Now that KVM leaves the timer running instead of truly disabling it, allow userspace to kill it entirely in the unlikely event the timer (or KVM) malfunctions. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 ++-- arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 60 ++++++++++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 22e02f10ece9..990e543f4531 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2012,9 +2012,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * PIN CONTROLS */ exec_control = vmx_pin_based_exec_ctrl(vmx); - exec_control |= vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is computed directly in vmx_vcpu_run. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= (vmcs12->pin_based_vm_exec_control & + ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 9a87a2482e3e..481ad879197b 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -61,6 +61,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + bool hv_timer_soft_disabled; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 09f1e0b70316..b939a688ae83 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2465,6 +2465,7 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) return -ENOMEM; loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs->hv_timer_soft_disabled = false; loaded_vmcs_init(loaded_vmcs); if (cpu_has_vmx_msr_bitmap()) { @@ -3835,8 +3836,9 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!enable_vnmi) pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; - /* Enable the preemption timer dynamically */ - pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + if (!enable_preemption_timer) + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + return pin_based_exec_ctrl; } @@ -5455,8 +5457,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - if (!to_vmx(vcpu)->req_immediate_exit) + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->req_immediate_exit && + !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) kvm_lapic_expired_hv_timer(vcpu); + return 1; } @@ -6356,12 +6362,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); - pin_controls_setbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); -} - static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6369,11 +6369,9 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) u32 delta_tsc; if (vmx->req_immediate_exit) { - vmx_arm_hv_timer(vmx, 0); - return; - } - - if (vmx->hv_deadline_tsc != -1) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (vmx->hv_deadline_tsc != -1) { tscl = rdtsc(); if (vmx->hv_deadline_tsc > tscl) /* set_hv_timer ensures the delta fits in 32-bits */ @@ -6382,11 +6380,12 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) else delta_tsc = 0; - vmx_arm_hv_timer(vmx, delta_tsc); - return; + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); + vmx->loaded_vmcs->hv_timer_soft_disabled = true; } - - pin_controls_clearbit(vmx, PIN_BASED_VMX_PREEMPTION_TIMER); } void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) @@ -6458,7 +6457,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_update_hv_timer(vcpu); + if (enable_preemption_timer) + vmx_update_hv_timer(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -7565,17 +7565,33 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_preemption_timer()) - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + enable_preemption_timer = false; - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + if (enable_preemption_timer) { + u64 use_timer_freq = 5000ULL * 1000 * 1000; u64 vmx_msr; rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); cpu_preemption_timer_multi = vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { + + if (tsc_khz) + use_timer_freq = (u64)tsc_khz * 1000; + use_timer_freq >>= cpu_preemption_timer_multi; + + /* + * KVM "disables" the preemption timer by setting it to its max + * value. Don't use the timer if it might cause spurious exits + * at a rate faster than 0.1 Hz (of uninterrupted guest time). + */ + if (use_timer_freq > 0xffffffffu / 10) + enable_preemption_timer = false; + } + + if (!enable_preemption_timer) { kvm_x86_ops->set_hv_timer = NULL; kvm_x86_ops->cancel_hv_timer = NULL; + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; } kvm_set_posted_intr_wakeup_handler(wakeup_handler); -- cgit v1.2.3 From eceb9973d908cb327372bf0d04f2d1ee0b27bff5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Jun 2019 16:16:39 +0200 Subject: KVM: nVMX: shadow pin based execution controls The VMX_PREEMPTION_TIMER flag may be toggled frequently, though not *very* frequently. Since it does not affect KVM's dirty logic, e.g. the preemption timer value is loaded from vmcs12 even if vmcs12 is "clean", there is no need to mark vmcs12 dirty when L1 writes pin controls, and shadowing the field achieves that. Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 4cea018ba285..eb1ecd16fd22 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -47,6 +47,7 @@ SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code) SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes) SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes) SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control) +SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control) SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap) SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code) SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field) -- cgit v1.2.3 From 4d763b168e9c5c366b05812c7bba7662e5ea3669 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 20 Jun 2019 17:00:02 +0800 Subject: KVM: VMX: check CPUID before allowing read/write of IA32_XSS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Raise #GP when guest read/write IA32_XSS, but the CPUID bits say that it shouldn't exist. Fixes: 203000993de5 (kvm: vmx: add MSR logic for XSAVES) Reported-by: Xiaoyao Li Reported-by: Tao Xu Cc: Paolo Bonzini Cc: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b939a688ae83..a35459ce7e29 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1732,7 +1732,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) + if (!vmx_xsaves_supported() || + (!msr_info->host_initiated && + !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) return 1; msr_info->data = vcpu->arch.ia32_xss; break; @@ -1962,7 +1965,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) + if (!vmx_xsaves_supported() || + (!msr_info->host_initiated && + !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) return 1; /* * The only supported bit as of Skylake is bit 8, but -- cgit v1.2.3 From a251fb90ab8a3e6efb2b4e14923ddb4421317f65 Mon Sep 17 00:00:00 2001 From: Saar Amar Date: Mon, 6 May 2019 11:29:16 +0300 Subject: KVM: x86: Fix apic dangling pointer in vcpu The function kvm_create_lapic() attempts to allocate the apic structure and sets a pointer to it in the virtual processor structure. However, if get_zeroed_page() failed, the function frees the apic chunk, but forgets to set the pointer in the vcpu to NULL. It's not a security issue since there isn't a use of that pointer if kvm_create_lapic() returns error, but it's more accurate that way. Signed-off-by: Saar Amar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e82a18ccfc1a..d6ca5c4f29f1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2339,6 +2339,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) return 0; nomem_free_apic: kfree(apic); + vcpu->arch.apic = NULL; nomem: return -ENOMEM; } -- cgit v1.2.3 From 6defc591846d0104d1a95f2fb450bdb05a9e738b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 Jul 2019 14:39:29 +0200 Subject: KVM: nVMX: include conditional controls in /dev/kvm KVM_GET_MSRS Some secondary controls are automatically enabled/disabled based on the CPUID values that are set for the guest. However, they are still available at a global level and therefore should be present when KVM_GET_MSRS is sent to /dev/kvm. Fixes: 1389309c811 ("KVM: nVMX: expose VMX capabilities for nested hypervisors to userspace", 2018-02-26) Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 990e543f4531..c4e29ef0b21e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5750,10 +5750,15 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | + SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_RDRAND_EXITING | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_XSAVES; /* * We can emulate "VMCS shadowing," even if the hardware -- cgit v1.2.3 From e8a70bd4e92525de459acfa5668c8e1f24664331 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 Jul 2019 14:40:40 +0200 Subject: KVM: nVMX: allow setting the VMFUNC controls MSR Allow userspace to set a custom value for the VMFUNC controls MSR, as long as the capabilities it advertises do not exceed those of the host. Fixes: 27c42a1bb ("KVM: nVMX: Enable VMFUNC for the L1 hypervisor", 2017-08-03) Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c4e29ef0b21e..163d226efa96 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1234,6 +1234,11 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_VMX_VMCS_ENUM: vmx->nested.msrs.vmcs_enum = data; return 0; + case MSR_IA32_VMX_VMFUNC: + if (data & ~vmx->nested.msrs.vmfunc_controls) + return -EINVAL; + vmx->nested.msrs.vmfunc_controls = data; + return 0; default: /* * The rest of the VMX capability MSRs do not support restore. -- cgit v1.2.3 From 95c5c7c77c06c7037385b3d8d4d7592ab032c3cb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 Jul 2019 14:45:24 +0200 Subject: KVM: nVMX: list VMX MSRs in KVM_GET_MSR_INDEX_LIST This allows userspace to know which MSRs are supported by the hypervisor. Unfortunately userspace must resort to tricks for everything except MSR_IA32_VMX_VMFUNC (which was just added in the previous patch). One possibility is to use the feature control MSR, which is tied to nested VMX as well and is present on all KVM versions that support feature MSRs. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bbc31f7213ed..5db50c19d1c7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5885,6 +5885,7 @@ static bool svm_has_emulated_msr(int index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; default: break; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a35459ce7e29..c43635942693 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6223,6 +6223,8 @@ static bool vmx_has_emulated_msr(int index) * real mode. */ return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return nested; case MSR_AMD64_VIRT_SPEC_CTRL: /* This is AMD only. */ return false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e536a2b2b0e8..721af7b46b24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1176,6 +1176,26 @@ static u32 emulated_msrs[] = { MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, + /* + * The following list leaves out MSRs whose values are determined + * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. + * We always support the "true" VMX control MSRs, even if the host + * processor does not, so I am putting these registers here rather + * than in msrs_to_save. + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + MSR_K7_HWCR, MSR_KVM_POLL_CONTROL, }; -- cgit v1.2.3 From a21a39c206f8a541ce9670666c0025d73383aa1a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 28 Jun 2019 13:23:32 +0200 Subject: x86/KVM/nVMX: don't use clean fields data on enlightened VMLAUNCH Apparently, Windows doesn't maintain clean fields data after it does VMCLEAR for an enlightened VMCS so we can only use it on VMRESUME. The issue went unnoticed because currently we do nested_release_evmcs() in handle_vmclear() and the consecutive enlightened VMPTRLD invalidates clean fields when a new eVMCS is mapped but we're going to change the logic. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 163d226efa96..2d0fba643c06 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1784,6 +1784,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_vp_assist_page assist_page; + bool evmcs_gpa_changed = false; if (likely(!vmx->nested.enlightened_vmcs_enabled)) return 1; @@ -1837,15 +1838,9 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } vmx->nested.dirty_vmcs12 = true; - /* - * As we keep L2 state for one guest only 'hv_clean_fields' mask - * can't be used when we switch between them. Reset it here for - * simplicity. - */ - vmx->nested.hv_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + evmcs_gpa_changed = true; /* * Unlike normal vmcs12, enlightened vmcs12 is not fully * reloaded from guest's memory (read only fields, fields not @@ -1859,6 +1854,15 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } } + + /* + * Clean fields data can't de used on VMLAUNCH and when we switch + * between different L2 guests as KVM keeps a single VMCS12 per L1. + */ + if (from_launch || evmcs_gpa_changed) + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + return 1; } @@ -3092,7 +3096,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) + if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) return 1; if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) -- cgit v1.2.3 From 11e349143e3289ab99f1b4858649f60703b7bf35 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 28 Jun 2019 13:23:33 +0200 Subject: x86/kvm/nVMX: fix VMCLEAR when Enlightened VMCS is in use When Enlightened VMCS is in use, it is valid to do VMCLEAR and, according to TLFS, this should "transition an enlightened VMCS from the active to the non-active state". It is, however, wrong to assume that it is only valid to do VMCLEAR for the eVMCS which is currently active on the vCPU performing VMCLEAR. Currently, the logic in handle_vmclear() is broken: in case, there is no active eVMCS on the vCPU doing VMCLEAR we treat the argument as a 'normal' VMCS and kvm_vcpu_write_guest() to the 'launch_state' field irreversibly corrupts the memory area. So, in case the VMCLEAR argument is not the current active eVMCS on the vCPU, how can we know if the area it is pointing to is a normal or an enlightened VMCS? Thanks to the bug in Hyper-V (see commit 72aeb60c52bf7 ("KVM: nVMX: Verify eVMCS revision id match supported eVMCS version on eVMCS VMPTRLD")) we can not, the revision can't be used to distinguish between them. So let's assume it is always enlightened in case enlightened vmentry is enabled in the assist page. Also, check if vmx->nested.enlightened_vmcs_enabled to minimize the impact for 'unenlightened' workloads. Fixes: b8bbab928fb1 ("KVM: nVMX: implement enlightened VMPTRLD and VMCLEAR") Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 18 ++++++++++++++++++ arch/x86/kvm/vmx/evmcs.h | 1 + arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++-------------- 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 5466c6d85cf3..72359709cdc1 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -3,6 +3,7 @@ #include #include +#include "../hyperv.h" #include "evmcs.h" #include "vmcs.h" #include "vmx.h" @@ -313,6 +314,23 @@ void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) } #endif +bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) +{ + struct hv_vp_assist_page assist_page; + + *evmcs_gpa = -1ull; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return false; + + if (unlikely(!assist_page.enlighten_vmentry)) + return false; + + *evmcs_gpa = assist_page.current_nested_vmcs; + + return true; +} + uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index e0fcef85b332..39a24eec8884 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -195,6 +195,7 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2d0fba643c06..509841916d23 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1783,27 +1783,22 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, bool from_launch) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_vp_assist_page assist_page; bool evmcs_gpa_changed = false; + u64 evmcs_gpa; if (likely(!vmx->nested.enlightened_vmcs_enabled)) return 1; - if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) return 1; - if (unlikely(!assist_page.enlighten_vmentry)) - return 1; - - if (unlikely(assist_page.current_nested_vmcs != - vmx->nested.hv_evmcs_vmptr)) { - + if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { if (!vmx->nested.hv_evmcs) vmx->nested.current_vmptr = -1ull; nested_release_evmcs(vcpu); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs), + if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) return 0; @@ -1838,7 +1833,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } vmx->nested.dirty_vmcs12 = true; - vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + vmx->nested.hv_evmcs_vmptr = evmcs_gpa; evmcs_gpa_changed = true; /* @@ -4436,6 +4431,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 zero = 0; gpa_t vmptr; + u64 evmcs_gpa; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4451,10 +4447,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - if (vmx->nested.hv_evmcs_map.hva) { - if (vmptr == vmx->nested.hv_evmcs_vmptr) - nested_release_evmcs(vcpu); - } else { + /* + * When Enlightened VMEntry is enabled on the calling CPU we treat + * memory area pointer by vmptr as Enlightened VMCS (as there's no good + * way to distinguish it from VMCS12) and we must not corrupt it by + * writing to the non-existent 'launch_state' field. The area doesn't + * have to be the currently active EVMCS on the calling CPU and there's + * nothing KVM has to do to transition it from 'active' to 'non-active' + * state. It is possible that the area will stay mapped as + * vmx->nested.hv_evmcs but this shouldn't be a problem. + */ + if (likely(!vmx->nested.enlightened_vmcs_enabled || + !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); -- cgit v1.2.3 From f85f6e7bc9682a6d8b342c010cd6aa58521fdeec Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 11 Jun 2019 20:23:48 +0800 Subject: KVM: X86: Yield to IPI target if necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sending a call-function IPI-many to vCPUs, yield if any of the IPI target vCPUs was preempted, we just select the first preempted target vCPU which we found since the state of target vCPUs can change underneath and to avoid race conditions. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/hypercalls.txt | 11 +++++++++++ arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kernel/kvm.c | 21 +++++++++++++++++++++ include/uapi/linux/kvm_para.h | 1 + 4 files changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index da24c138c8d1..da210651f714 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -141,3 +141,14 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1 corresponds to the APIC ID a2+1, and so on. Returns the number of CPUs to which the IPIs were delivered successfully. + +7. KVM_HC_SCHED_YIELD +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to yield if the IPI target vCPU is preempted + +a0: destination APIC ID + +Usage example: When sending a call-function IPI-many to vCPUs, yield if +any of the IPI target vCPUs was preempted. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 21d5f0240595..2a8e0b6b9805 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -30,6 +30,7 @@ #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 +#define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5169b8cc35bb..82caf01b63dd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -527,6 +527,21 @@ static void kvm_setup_pv_ipi(void) pr_info("KVM setup pv IPIs\n"); } +static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) +{ + int cpu; + + native_send_call_func_ipi(mask); + + /* Make sure other vCPUs get a chance to run if they need to. */ + for_each_cpu(cpu, mask) { + if (vcpu_is_preempted(cpu)) { + kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); + break; + } + } +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -638,6 +653,12 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; + pr_info("KVM setup pv sched yield\n"); + } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index 6c0ce49931e5..8b86609849b9 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 #define KVM_HC_CLOCK_PAIRING 9 #define KVM_HC_SEND_IPI 10 +#define KVM_HC_SCHED_YIELD 11 /* * hypercalls use architecture specific -- cgit v1.2.3 From 715062970f3748fbbf55c1a8cb495051760a65c1 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 11 Jun 2019 20:23:49 +0800 Subject: KVM: X86: Implement PV sched yield hypercall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target vCPUs are in runnable state after vcpu_kick and suitable as a yield target. This patch implements the sched yield hypercall. 17% performance increasement of ebizzy benchmark can be observed in an over-subscribe environment. (w/ kvm-pv-tlb disabled, testing TLB flush call-function IPI-many since call-function is not easy to be trigged by userspace workload). Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 721af7b46b24..2e302e977dac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7193,6 +7193,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } +static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +{ + struct kvm_vcpu *target = NULL; + struct kvm_apic_map *map; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) + target = map->phys_map[dest_id]->vcpu; + + rcu_read_unlock(); + + if (target) + kvm_vcpu_yield_to(target); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -7239,6 +7256,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_SEND_IPI: ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; + case KVM_HC_SCHED_YIELD: + kvm_sched_yield(vcpu->kvm, a0); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; -- cgit v1.2.3 From 32b72ecc83b651fb8633ac4bd44957c54367699d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 11 Jun 2019 20:23:50 +0800 Subject: KVM: X86: Expose PV_SCHED_YIELD CPUID feature bit to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose PV_SCHED_YIELD feature bit to guest, the guest can check this feature bit before using paravirtualized sched yield. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/cpuid.txt | 4 ++++ arch/x86/kvm/cpuid.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 97ca1940a0dc..979a77ba5377 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -66,6 +66,10 @@ KVM_FEATURE_PV_SEND_IPI || 11 || guest checks this feature bit || || before using paravirtualized || || send IPIs. ------------------------------------------------------------------------------ +KVM_FEATURE_PV_SCHED_YIELD || 13 || guest checks this feature bit + || || before using paravirtualized + || || sched yield. +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 68c74e948e28..004cbd84c351 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -659,7 +659,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_PV_TLB_FLUSH) | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | - (1 << KVM_FEATURE_POLL_CONTROL); + (1 << KVM_FEATURE_POLL_CONTROL) | + (1 << KVM_FEATURE_PV_SCHED_YIELD); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); -- cgit v1.2.3 From b119019847fbcac36ed1384f166c91f177d070e7 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 13 Jun 2019 09:16:08 -0700 Subject: kvm: nVMX: Remove unnecessary sync_roots from handle_invept When L0 is executing handle_invept(), the TDP MMU is active. Emulating an L1 INVEPT does require synchronizing the appropriate shadow EPT root(s), but a call to kvm_mmu_sync_roots in this context won't do that. Similarly, the hardware TLB and paging-structure-cache entries associated with the appropriate shadow EPT root(s) must be flushed, but requesting a TLB_FLUSH from this context won't do that either. How did this ever work? KVM always does a sync_roots and TLB flush (in the correct context) when transitioning from L1 to L2. That isn't the best choice for nested VM performance, but it effectively papers over the mistakes here. Remove the unnecessary operations and leave a comment to try to do better in the future. Reported-by: Junaid Shahid Fixes: bfd0a56b90005f ("nEPT: Nested INVEPT") Cc: Xiao Guangrong Cc: Nadav Har'El Cc: Jun Nakajima Cc: Xinhao Xu Cc: Yang Zhang Cc: Gleb Natapov Cc: Paolo Bonzini Reviewed-by Peter Shier Reviewed-by: Junaid Shahid Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 509841916d23..118d185764ec 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4835,13 +4835,11 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: + case VMX_EPT_EXTENT_CONTEXT: /* - * TODO: track mappings and invalidate - * single context requests appropriately + * TODO: Sync the necessary shadow EPT roots here, rather than + * at the next emulated VM-entry. */ - case VMX_EPT_EXTENT_CONTEXT: - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); break; default: BUG_ON(1); -- cgit v1.2.3 From c550505b57832936aebb5eb376375bc258f8a807 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 27 Jun 2019 11:36:51 -0700 Subject: kvm: x86: Pass through AMD_STIBP_ALWAYS_ON in GET_SUPPORTED_CPUID This bit is purely advisory. Passing it through to the guest indicates that the virtual processor, like the physical processor, prefers that STIBP is only set once during boot and not changed. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 004cbd84c351..417687bc7386 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -390,7 +390,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP); + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = -- cgit v1.2.3 From d647eb63e67156fae44d2b467b1f314fc676de7a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 20 Jun 2019 14:13:33 +0200 Subject: KVM: svm: add nrips module parameter Allow testing code for old processors that lack the next RIP save feature, by disabling usage of the next_rip field. Nested hypervisors however get the feature unconditionally. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5db50c19d1c7..5270711e787f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -367,6 +367,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* enable/disable Next RIP Save */ +static int nrips = true; +module_param(nrips, int, 0444); + /* enable/disable Virtual VMLOAD VMSAVE */ static int vls = true; module_param(vls, int, 0444); @@ -773,7 +777,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->vmcb->control.next_rip != 0) { + if (nrips && svm->vmcb->control.next_rip != 0) { WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; } @@ -810,7 +814,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(&svm->vcpu); - if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { + if (nr == BP_VECTOR && !nrips) { unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); /* @@ -1367,6 +1371,11 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + if (avic) { if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC) || @@ -3938,7 +3947,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) { int err; - if (!static_cpu_has(X86_FEATURE_NRIPS)) + if (!nrips) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); -- cgit v1.2.3 From 7be373b6de503cd1eed6be4f2780925a5af03230 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 14 Jun 2019 09:15:48 +0800 Subject: KVM: LAPIC: remove the trailing newline used in the fmt parameter of TP_printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trailing newlines will lead to extra newlines in the trace file which looks like the following output, so remove it. qemu-system-x86-15695 [002] ...1 15774.839240: kvm_hv_timer_state: vcpu_id 0 hv_timer 1 qemu-system-x86-15695 [002] ...1 15774.839309: kvm_hv_timer_state: vcpu_id 0 hv_timer 1 Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4d47a2631d1f..b5c831e79094 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1365,7 +1365,7 @@ TRACE_EVENT(kvm_hv_timer_state, __entry->vcpu_id = vcpu_id; __entry->hv_timer_in_use = hv_timer_in_use; ), - TP_printk("vcpu_id %x hv_timer %x\n", + TP_printk("vcpu_id %x hv_timer %x", __entry->vcpu_id, __entry->hv_timer_in_use) ); -- cgit v1.2.3 From ab8bcf64971180e1344ce2c7e70c49b0f24f6b0d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jun 2019 10:23:33 +0200 Subject: KVM: cpuid: do_cpuid_ent works on a whole CPUID function Rename it as well as __do_cpuid_ent and __do_cpuid_ent_emulated to have "func" in its name, and drop the index parameter which is always 0. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 89 +++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 417687bc7386..85677d9188e3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -294,14 +294,19 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, { entry->function = function; entry->index = index; + entry->flags = 0; + cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; } -static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, - u32 func, u32 index, int *nent, int maxnent) +static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, int *nent, int maxnent) { + entry->function = func; + entry->index = 0; + entry->flags = 0; + switch (func) { case 0: entry->eax = 7; @@ -313,21 +318,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, break; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - if (index == 0) - entry->ecx = F(RDPID); + entry->eax = 0; + entry->ecx = F(RDPID); ++*nent; default: break; } - entry->function = func; - entry->index = index; - return 0; } -static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, + int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; @@ -431,7 +433,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (*nent >= maxnent) goto out; - do_cpuid_1_ent(entry, function, index); + do_cpuid_1_ent(entry, function, 0); ++*nent; switch (function) { @@ -496,34 +498,28 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capability word 9 */ - if (index == 0) { - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_7_0_EBX); - // TSC_ADJUST is emulated - entry->ebx |= F(TSC_ADJUST); - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); - cpuid_mask(&entry->ecx, CPUID_7_ECX); - /* Set LA57 based on hardware capability. */ - entry->ecx |= f_la57; - entry->ecx |= f_umip; - /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); - entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_7_EDX); - /* - * We emulate ARCH_CAPABILITIES in software even - * if the host doesn't support it. - */ - entry->edx |= F(ARCH_CAPABILITIES); - } else { - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - } entry->eax = 0; + /* Mask ebx against host capability word 9 */ + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + f_la57 = entry->ecx & F(LA57); + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* Set LA57 based on hardware capability. */ + entry->ecx |= f_la57; + entry->ecx |= f_umip; + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); break; } case 9: @@ -750,20 +746,19 @@ out: return r; } -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, - u32 idx, int *nent, int maxnent, unsigned int type) +static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, + int *nent, int maxnent, unsigned int type) { if (type == KVM_GET_EMULATED_CPUID) - return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + return __do_cpuid_func_emulated(entry, func, nent, maxnent); - return __do_cpuid_ent(entry, func, idx, nent, maxnent); + return __do_cpuid_func(entry, func, nent, maxnent); } #undef F struct kvm_cpuid_param { u32 func; - u32 idx; bool has_leaf_count; bool (*qualifier)(const struct kvm_cpuid_param *param); }; @@ -836,8 +831,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (ent->qualifier && !ent->qualifier(ent)) continue; - r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent, type); + r = do_cpuid_func(&cpuid_entries[nent], ent->func, + &nent, cpuid->nent, type); if (r) goto out_free; @@ -847,8 +842,8 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) - r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent, type); + r = do_cpuid_func(&cpuid_entries[nent], func, + &nent, cpuid->nent, type); if (r) goto out_free; -- cgit v1.2.3 From 54d360d41211006437bebf97513394693bd32623 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jul 2019 12:18:13 +0200 Subject: KVM: cpuid: extract do_cpuid_7_mask and support multiple subleafs CPUID function 7 has multiple subleafs. Instead of having nested switch statements, move the logic to filter supported features to a separate function, and call it for each subleaf. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 128 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 81 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 85677d9188e3..2e852a87bb69 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -328,6 +328,71 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, return 0; } +static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) +{ + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; + unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; + unsigned f_la57; + + /* cpuid 7.0.ebx */ + const u32 kvm_cpuid_7_0_ebx_x86_features = + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; + + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR); + + switch (index) { + case 0: + entry->eax = 0; + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); + /* TSC_ADJUST is emulated */ + entry->ebx |= F(TSC_ADJUST); + + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + f_la57 = entry->ecx & F(LA57); + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* Set LA57 based on hardware capability. */ + entry->ecx |= f_la57; + entry->ecx |= f_umip; + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); + + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); + break; + default: + WARN_ON_ONCE(1); + entry->eax = 0; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } +} + static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, int *nent, int maxnent) { @@ -342,12 +407,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; - unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; - unsigned f_la57 = 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -400,31 +461,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); - /* cpuid 7.0.ebx */ - const u32 kvm_cpuid_7_0_ebx_x86_features = - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; - /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; - /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); - - /* cpuid 7.0.edx*/ - const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR); - /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -496,30 +536,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx = 0; entry->edx = 0; break; + /* function 7 has additional index. */ case 7: { + int i; + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - entry->eax = 0; - /* Mask ebx against host capability word 9 */ - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_7_0_EBX); - // TSC_ADJUST is emulated - entry->ebx |= F(TSC_ADJUST); - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); - cpuid_mask(&entry->ecx, CPUID_7_ECX); - /* Set LA57 based on hardware capability. */ - entry->ecx |= f_la57; - entry->ecx |= f_umip; - /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); - entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_7_EDX); - /* - * We emulate ARCH_CAPABILITIES in software even - * if the host doesn't support it. - */ - entry->edx |= F(ARCH_CAPABILITIES); + for (i = 0; ; ) { + do_cpuid_7_mask(&entry[i], i); + if (i == entry->eax) + break; + if (*nent >= maxnent) + goto out; + + ++i; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } break; } case 9: -- cgit v1.2.3 From d9aadaf689928ba896529cb684729923b21c2de5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jul 2019 12:20:48 +0200 Subject: KVM: cpuid: set struct kvm_cpuid_entry2 flags in do_cpuid_1_ent do_cpuid_1_ent is typically called in two places by __do_cpuid_func for CPUID functions that have subleafs. Both places have to set the KVM_CPUID_FLAG_SIGNIFCANT_INDEX. Set that flag, and KVM_CPUID_FLAG_STATEFUL_FUNC as well, directly in do_cpuid_1_ent. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2e852a87bb69..a46e882ed3bd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -298,6 +298,20 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + + switch (function) { + case 2: + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + break; + case 4: + case 7: + case 0xb: + case 0xd: + case 0x14: + case 0x8000001d: + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + break; + } } static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, @@ -497,14 +511,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 2: { int t, times = entry->eax & 0xff; - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times; ++t) { if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; ++*nent; } break; @@ -514,7 +526,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 0x8000001d: { int i, cache_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until cache_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -524,8 +535,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!cache_type) break; do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -540,7 +549,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 7: { int i; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (i = 0; ; ) { do_cpuid_7_mask(&entry[i], i); if (i == entry->eax) @@ -550,8 +558,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, ++i; do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -595,7 +601,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, case 0xb: { int i, level_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -605,8 +610,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!level_type) break; do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; @@ -619,7 +622,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx = xstate_required_size(supported, false); entry->ecx = entry->ebx; entry->edx &= supported >> 32; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; if (!supported) break; @@ -645,8 +647,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, } entry[i].ecx = 0; entry[i].edx = 0; - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; ++i; } @@ -659,12 +659,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (!f_intel_pt) break; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (t = 1; t <= times; ++t) { if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[t], function, t); - entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; -- cgit v1.2.3 From 50a9e1a4b1dece60fd79ecdee25db01274a7f291 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jun 2019 10:29:25 +0200 Subject: KVM: cpuid: rename do_cpuid_1_ent do_cpuid_1_ent does not do the entire processing for a CPUID entry, it only retrieves the host's values. Rename it to match reality. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a46e882ed3bd..e172e543f506 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -289,7 +289,7 @@ static void cpuid_mask(u32 *word, int wordnum) *word &= boot_cpu_data.x86_capability[wordnum]; } -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { entry->function = function; @@ -487,7 +487,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (*nent >= maxnent) goto out; - do_cpuid_1_ent(entry, function, 0); + do_host_cpuid(entry, function, 0); ++*nent; switch (function) { @@ -516,7 +516,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (*nent >= maxnent) goto out; - do_cpuid_1_ent(&entry[t], function, 0); + do_host_cpuid(&entry[t], function, 0); ++*nent; } break; @@ -534,7 +534,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, cache_type = entry[i - 1].eax & 0x1f; if (!cache_type) break; - do_cpuid_1_ent(&entry[i], function, i); + do_host_cpuid(&entry[i], function, i); ++*nent; } break; @@ -557,7 +557,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, goto out; ++i; - do_cpuid_1_ent(&entry[i], function, i); + do_host_cpuid(&entry[i], function, i); ++*nent; } break; @@ -609,7 +609,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; - do_cpuid_1_ent(&entry[i], function, i); + do_host_cpuid(&entry[i], function, i); ++*nent; } break; @@ -630,7 +630,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, if (*nent >= maxnent) goto out; - do_cpuid_1_ent(&entry[i], function, idx); + do_host_cpuid(&entry[i], function, idx); if (idx == 1) { entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); @@ -662,7 +662,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, for (t = 1; t <= times; ++t) { if (*nent >= maxnent) goto out; - do_cpuid_1_ent(&entry[t], function, t); + do_host_cpuid(&entry[t], function, t); ++*nent; } break; -- cgit v1.2.3 From 60cec433c485564bd7caac38a9df5c1ed79ee560 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jun 2019 10:32:57 +0200 Subject: KVM: cpuid: remove has_leaf_count from struct kvm_cpuid_param The has_leaf_count member was originally added for KVM's paravirtualization CPUID leaves. However, since then the leaf count _has_ been added to those leaves as well, so we can drop that special case. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e172e543f506..2238145f23df 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -791,7 +791,6 @@ static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, struct kvm_cpuid_param { u32 func; - bool has_leaf_count; bool (*qualifier)(const struct kvm_cpuid_param *param); }; @@ -835,11 +834,10 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, int limit, nent = 0, r = -E2BIG, i; u32 func; static const struct kvm_cpuid_param param[] = { - { .func = 0, .has_leaf_count = true }, - { .func = 0x80000000, .has_leaf_count = true }, - { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, + { .func = 0 }, + { .func = 0x80000000 }, + { .func = 0xC0000000, .qualifier = is_centaur_cpu }, { .func = KVM_CPUID_SIGNATURE }, - { .func = KVM_CPUID_FEATURES }, }; if (cpuid->nent < 1) @@ -869,9 +867,6 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (r) goto out_free; - if (!ent->has_leaf_count) - continue; - limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) r = do_cpuid_func(&cpuid_entries[nent], func, -- cgit v1.2.3 From 43fdcda96e2550c6d1c46fb8a78801aa2f7276ed Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Thu, 3 Jan 2019 16:22:21 -0800 Subject: kvm: x86: Do not release the page inside mmu_set_spte() Release the page at the call-site where it was originally acquired. This makes the exit code cleaner for most call sites, since they do not need to duplicate code between success and the failure label. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 18 +++++++----------- arch/x86/kvm/paging_tmpl.h | 8 +++----- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 771349e72d2a..6fc5c389f5a1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3095,8 +3095,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return ret; } @@ -3131,9 +3129,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret <= 0) return -1; - for (i = 0; i < ret; i++, gfn++, start++) + for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); + put_page(pages[i]); + } return 0; } @@ -3530,6 +3530,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -3538,14 +3539,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -4159,6 +4157,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -4167,14 +4166,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 367a47df4ba0..2db96401178e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -543,6 +543,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + kvm_release_pfn_clean(pfn); return true; } @@ -694,7 +695,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, return ret; out_gpte_changed: - kvm_release_pfn_clean(pfn); return RET_PF_RETRY; } @@ -842,6 +842,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.pte_access &= ~ACC_EXEC_MASK; } + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -855,14 +856,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) -- cgit v1.2.3 From 3fcf2d1bdeb6a513523cb2c77012a6b047aa859c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jun 2019 13:06:21 +0200 Subject: KVM: x86: make FNAME(fetch) and __direct_map more similar These two functions are basically doing the same thing through kvm_mmu_get_page, link_shadow_page and mmu_set_spte; yet, for historical reasons, their code looks very different. This patch tries to take the best of each and make them very similar, so that it is easy to understand changes that apply to both of them. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 53 ++++++++++++++++++++++------------------------ arch/x86/kvm/paging_tmpl.h | 30 ++++++++++++-------------- 2 files changed, 39 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6fc5c389f5a1..af9dafa54f85 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3181,40 +3181,39 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, + bool prefault) { - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t base_gfn = gfn; if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return 0; + return RET_PF_RETRY; - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, - map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; + for_each_shadow_entry(vcpu, gpa, it) { + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) break; - } - drop_large_spte(vcpu, iterator.sptep); - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; + drop_large_spte(vcpu, it.sptep); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, 1, ACC_ALL); - - link_shadow_page(vcpu, iterator.sptep, sp); + link_shadow_page(vcpu, it.sptep, sp); } } - return emulate; + + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, + write, level, base_gfn, pfn, prefault, + map_writable); + direct_pte_prefetch(vcpu, it.sptep); + ++vcpu->stat.pf_fixed; + return ret; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -3538,8 +3537,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - + r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4165,8 +4163,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2db96401178e..bfd89966832b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -623,6 +623,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; + gfn_t base_gfn; direct_access = gw->pte_access; @@ -667,31 +668,29 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - for (; - shadow_walk_okay(&it) && it.level > hlevel; - shadow_walk_next(&it)) { - gfn_t direct_gfn; + base_gfn = gw->gfn; + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == hlevel) + break; + validate_direct_spte(vcpu, it.sptep, direct_access); drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) - continue; - - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + it.level - 1, true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); + } } - clear_sp_write_flooding_count(it.sptep); ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + it.level, base_gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - + ++vcpu->stat.pf_fixed; return ret; out_gpte_changed: @@ -854,7 +853,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault); - ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: -- cgit v1.2.3 From d679b32611c0102ce33b9e1a4e4b94854ed1812a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 23 Jun 2019 19:15:49 +0200 Subject: KVM: x86: remove now unneeded hugepage gfn adjustment After the previous patch, the low bits of the gfn are masked in both FNAME(fetch) and __direct_map, so we do not need to clear them in transparent_hugepage_adjust. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 9 +++------ arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index af9dafa54f85..084c1a0d9f98 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3240,11 +3240,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; int level = *levelp; /* @@ -3271,8 +3270,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; kvm_get_pfn(pfn); @@ -3536,7 +3533,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -4162,7 +4159,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bfd89966832b..f39b381a8b88 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -850,7 +850,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); -- cgit v1.2.3 From e9f2a760b158551bfbef6db31d2cae45ab8072e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 30 Jun 2019 08:36:21 -0400 Subject: KVM: x86: change kvm_mmu_page_get_gfn BUG_ON to WARN_ON Note that in such a case it is quite likely that KVM will BUG_ON in __pte_list_remove when the VM is closed. However, there is no immediate risk of memory corruption in the host so a WARN_ON is enough and it lets you gather traces for debugging. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 084c1a0d9f98..0629a89bb070 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1098,10 +1098,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else + if (!sp->role.direct) { sp->gfns[index] = gfn; + return; + } + + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) + pr_err_ratelimited("gfn mismatch under direct page %llx " + "(expected %llx, got %llx)\n", + sp->gfn, + kvm_mmu_page_get_gfn(sp, index), gfn); } /* -- cgit v1.2.3 From 335e192a3fa415e1202c8b9ecdaaecd643f823cc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 1 Jul 2019 06:22:57 -0400 Subject: KVM: x86: add tracepoints around __direct_map and FNAME(fetch) These are useful in debugging shadow paging. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 13 +++++----- arch/x86/kvm/mmutrace.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 2 ++ 3 files changed, 67 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0629a89bb070..6248c39a33ef 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -143,9 +143,6 @@ module_param(dbg, bool, 0644); #include -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -269,9 +266,13 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static u8 __read_mostly shadow_phys_bits; static void mmu_spte_set(u64 *sptep, u64 spte); +static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + static inline bool kvm_available_flush_tlb_with_range(void) { @@ -3086,10 +3087,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, - *sptep, sptep); + trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -3200,6 +3198,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return RET_PF_RETRY; + trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index dd30dccd2ad5..d8001b4bca05 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -301,6 +301,65 @@ TRACE_EVENT( __entry->kvm_gen == __entry->spte_gen ) ); + +TRACE_EVENT( + kvm_mmu_set_spte, + TP_PROTO(int level, gfn_t gfn, u64 *sptep), + TP_ARGS(level, gfn, sptep), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(u64, sptep) + __field(u8, level) + /* These depend on page entry type, so compute them now. */ + __field(bool, r) + __field(bool, x) + __field(u8, u) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = *sptep; + __entry->sptep = virt_to_phys(sptep); + __entry->level = level; + __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); + __entry->x = is_executable_pte(__entry->spte); + __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; + ), + + TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", + __entry->gfn, __entry->spte, + __entry->r ? "r" : "-", + __entry->spte & PT_WRITABLE_MASK ? "w" : "-", + __entry->x ? "x" : "-", + __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), + __entry->level, __entry->sptep + ) +); + +TRACE_EVENT( + kvm_mmu_spte_requested, + TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), + TP_ARGS(addr, level, pfn), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, pfn) + __field(u8, level) + ), + + TP_fast_assign( + __entry->gfn = addr >> PAGE_SHIFT; + __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + __entry->level = level; + ), + + TP_printk("gfn %llx pfn %llx level %d", + __entry->gfn, __entry->pfn, __entry->level + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f39b381a8b88..e9d110fdcb8e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -670,6 +670,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, base_gfn = gw->gfn; + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); -- cgit v1.2.3 From f087a02941feacf7d6f097522bc67c602fda18e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 7 Jun 2019 11:55:34 -0700 Subject: KVM: nVMX: Stash L1's CR3 in vmcs01.GUEST_CR3 on nested entry w/o EPT KVM does not have 100% coverage of VMX consistency checks, i.e. some checks that cause VM-Fail may only be detected by hardware during a nested VM-Entry. In such a case, KVM must restore L1's state to the pre-VM-Enter state as L2's state has already been loaded into KVM's software model. L1's CR3 and PDPTRs in particular are loaded from vmcs01.GUEST_*. But when EPT is disabled, the associated fields hold KVM's shadow values, not L1's "real" values. Fortunately, when EPT is disabled the PDPTRs come from memory, i.e. are not cached in the VMCS. Which leaves CR3 as the sole anomaly. A previously applied workaround to handle CR3 was to force nested early checks if EPT is disabled: commit 2b27924bb1d48 ("KVM: nVMX: always use early vmcs check when EPT is disabled") Forcing nested early checks is undesirable as doing so adds hundreds of cycles to every nested VM-Entry. Rather than take this performance hit, handle CR3 by overwriting vmcs01.GUEST_CR3 with L1's CR3 during nested VM-Entry when EPT is disabled *and* nested early checks are disabled. By stuffing vmcs01.GUEST_CR3, nested_vmx_restore_host_state() will naturally restore the correct vcpu->arch.cr3 from vmcs01.GUEST_CR3. These shenanigans work because nested_vmx_restore_host_state() does a full kvm_mmu_reset_context(), i.e. unloads the current MMU, which guarantees vmcs01.GUEST_CR3 will be rewritten with a new shadow CR3 prior to re-entering L1. vcpu->arch.root_mmu.root_hpa is set to INVALID_PAGE via: nested_vmx_restore_host_state() -> kvm_mmu_reset_context() -> kvm_mmu_unload() -> kvm_mmu_free_roots() kvm_mmu_unload() has WARN_ON(root_hpa != INVALID_PAGE), i.e. we can bank on 'root_hpa == INVALID_PAGE' unless the implementation of kvm_mmu_reset_context() is changed. On the way into L1, VMCS.GUEST_CR3 is guaranteed to be written (on a successful entry) via: vcpu_enter_guest() -> kvm_mmu_reload() -> kvm_mmu_load() -> kvm_mmu_load_cr3() -> vmx_set_cr3() Stuff vmcs01.GUEST_CR3 if and only if nested early checks are disabled as a "late" VM-Fail should never happen win that case (KVM WARNs), and the conditional write avoids the need to restore the correct GUEST_CR3 when nested_vmx_check_vmentry_hw() fails. Signed-off-by: Sean Christopherson Message-Id: <20190607185534.24368-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/vmx.h | 1 - arch/x86/kvm/vmx/nested.c | 44 +++++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d213ec5c3766..f0b0c90dd398 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -146,7 +146,6 @@ #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 -#define VMX_ABORT_VMCS_CORRUPTED 3 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 118d185764ec..d125304ae2c9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2978,6 +2978,25 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + /* + * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* + * nested early checks are disabled. In the event of a "late" VM-Fail, + * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its + * software model to the pre-VMEntry host state. When EPT is disabled, + * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes + * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing + * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to + * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested + * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is + * guaranteed to be overwritten with a shadow CR3 prior to re-entering + * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as + * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks + * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail + * path would need to manually save/restore vmcs01.GUEST_CR3. + */ + if (!enable_ept && !nested_early_check) + vmcs_writel(GUEST_CR3, vcpu->arch.cr3); + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); prepare_vmcs02_early(vmx, vmcs12); @@ -3869,18 +3888,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); - - /* - * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 - * points to shadow pages! Fortunately we only get here after a WARN_ON - * if EPT is disabled, so a VMabort is perfectly fine. - */ - if (enable_ept) { - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - } else { - nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); - } + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -3888,7 +3897,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * VMFail, like everything else we just need to ensure our * software model is up-to-date. */ - ept_save_pdptrs(vcpu); + if (enable_ept) + ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); @@ -5889,14 +5899,6 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; - /* - * Without EPT it is not possible to restore L1's CR3 and PDPTR on - * VMfail, because they are not available in vmcs01. Just always - * use hardware checks. - */ - if (!enable_ept) - nested_early_check = 1; - if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { -- cgit v1.2.3 From 1ef23e1f16088f5a91b25bb17585c2d532a097f7 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 3 Jul 2019 19:54:35 -0400 Subject: KVM nVMX: Check Host Segment Registers and Descriptor Tables on vmentry of nested guests According to section "Checks on Host Segment and Descriptor-Table Registers" in Intel SDM vol 3C, the following checks are performed on vmentry of nested guests: - In the selector field for each of CS, SS, DS, ES, FS, GS and TR, the RPL (bits 1:0) and the TI flag (bit 2) must be 0. - The selector fields for CS and TR cannot be 0000H. - The selector field for SS cannot be 0000H if the "host address-space size" VM-exit control is 0. - On processors that support Intel 64 architecture, the base-address fields for FS, GS and TR must contain canonical addresses. Signed-off-by: Krish Sadhukhan Reviewed-by: Karl Heubaum Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d125304ae2c9..6dd20e0ad2fa 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2627,6 +2627,30 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, !kvm_pat_valid(vmcs12->host_ia32_pat)) return -EINVAL; + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + + if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_cs_selector == 0 || + vmcs12->host_tr_selector == 0 || + (vmcs12->host_ss_selector == 0 && !ia32e)) + return -EINVAL; + +#ifdef CONFIG_X86_64 + if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) || + is_noncanonical_address(vmcs12->host_gs_base, vcpu) || + is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) || + is_noncanonical_address(vmcs12->host_idtr_base, vcpu) || + is_noncanonical_address(vmcs12->host_tr_base, vcpu)) + return -EINVAL; +#endif + /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2634,8 +2658,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, * the host address-space size VM-exit control. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) -- cgit v1.2.3 From 101628ded5d990f7f364bd4d371ada9b2beaae64 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 5 Jul 2019 14:14:15 +0200 Subject: KVM: LAPIC: ARBPRI is a reserved register for x2APIC kvm-unit-tests were adjusted to match bare metal behavior, but KVM itself was not doing what bare metal does; fix that. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d6ca5c4f29f1..2e4470f2685a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1318,7 +1318,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, unsigned char alignment = offset & 0xf; u32 result; /* this bitmask has a bit cleared for each reserved register */ - static const u64 rmask = 0x43ff01ffffffe70cULL; + u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { apic_debug("KVM_APIC_READ: alignment error %x %d\n", @@ -1326,6 +1326,10 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, return 1; } + /* ARBPRI is also reserved on x2APIC */ + if (apic_x2apic_mode(apic)) + rmask &= ~(1 << (APIC_ARBPRI >> 4)); + if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { apic_debug("KVM_APIC_READ: read reserved register %x\n", offset); -- cgit v1.2.3 From 01402cf81051f796dac7c60ca11d6147153ca46a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 5 Jul 2019 14:57:58 +0200 Subject: kvm: LAPIC: write down valid APIC registers Replace a magic 64-bit mask with a list of valid registers, computing the same mask in the end. Suggested-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2e4470f2685a..e4227ceab0c6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1312,25 +1312,45 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_lapic, dev); } +#define APIC_REG_MASK(reg) (1ull << ((reg) >> 4)) +#define APIC_REGS_MASK(first, count) \ + (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) + int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { unsigned char alignment = offset & 0xf; u32 result; /* this bitmask has a bit cleared for each reserved register */ - u64 rmask = 0x43ff01ffffffe70cULL; - - if ((alignment + len) > 4) { - apic_debug("KVM_APIC_READ: alignment error %x %d\n", - offset, len); - return 1; - } - - /* ARBPRI is also reserved on x2APIC */ - if (apic_x2apic_mode(apic)) - rmask &= ~(1 << (APIC_ARBPRI >> 4)); + u64 valid_reg_mask = + APIC_REG_MASK(APIC_ID) | + APIC_REG_MASK(APIC_LVR) | + APIC_REG_MASK(APIC_TASKPRI) | + APIC_REG_MASK(APIC_PROCPRI) | + APIC_REG_MASK(APIC_LDR) | + APIC_REG_MASK(APIC_DFR) | + APIC_REG_MASK(APIC_SPIV) | + APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) | + APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) | + APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | + APIC_REG_MASK(APIC_ESR) | + APIC_REG_MASK(APIC_ICR) | + APIC_REG_MASK(APIC_ICR2) | + APIC_REG_MASK(APIC_LVTT) | + APIC_REG_MASK(APIC_LVTTHMR) | + APIC_REG_MASK(APIC_LVTPC) | + APIC_REG_MASK(APIC_LVT0) | + APIC_REG_MASK(APIC_LVT1) | + APIC_REG_MASK(APIC_LVTERR) | + APIC_REG_MASK(APIC_TMICT) | + APIC_REG_MASK(APIC_TMCCT) | + APIC_REG_MASK(APIC_TDCR); + + /* ARBPRI is not valid on x2APIC */ + if (!apic_x2apic_mode(apic)) + valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); - if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { + if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) { apic_debug("KVM_APIC_READ: read reserved register %x\n", offset); return 1; -- cgit v1.2.3 From 548f7fb22234c6fe13c64459059fbd42058953c4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 5 Jul 2019 23:23:42 +0800 Subject: KVM: LAPIC: Retry tune per-vCPU timer_advance_ns if adaptive tuning goes insane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retry tune per-vCPU timer_advance_ns if adaptive tuning goes insane which can happen sporadically in product environment. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e4227ceab0c6..e1a2e2d467ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -71,6 +71,7 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 +#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1546,8 +1547,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) apic->lapic_timer.timer_advance_adjust_done = true; if (unlikely(timer_advance_ns > 5000)) { - timer_advance_ns = 0; - apic->lapic_timer.timer_advance_adjust_done = true; + timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_adjust_done = false; } apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -2344,7 +2345,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = 1000; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; apic->lapic_timer.timer_advance_adjust_done = false; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; -- cgit v1.2.3 From cdc238eb72f6b94b6c33b98c07b9fc3ac5e57b18 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 10 Jul 2019 08:24:03 +0800 Subject: kvm: x86: Fix -Wmissing-prototypes warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get a warning when build kernel W=1: arch/x86/kvm/../../../virt/kvm/eventfd.c:48:1: warning: no previous prototype for ‘kvm_arch_irqfd_allowed’ [-Wmissing-prototypes] kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) ^ The reason is kvm_arch_irqfd_allowed() is declared in arch/x86/kvm/irq.h, which is not included by eventfd.c. Considering kvm_arch_irqfd_allowed() is a weakly defined function in eventfd.c, remove the declaration to kvm_host.h can fix this. Signed-off-by: Yi Wang Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.h | 1 - include/linux/kvm_host.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index fd210cdd4983..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -114,7 +114,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return mode != KVM_IRQCHIP_NONE; } -bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index abafddb9fe2c..b91829ee3db1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -993,6 +993,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* * search_memslots() and __gfn_to_memslot() are here because they are -- cgit v1.2.3 From 66bb8a065f5aedd4551d8d3fbce582972f65c2e1 Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Wed, 10 Jul 2019 18:25:15 -0700 Subject: KVM: x86: PMU Event Filter Some events can provide a guest with information about other guests or the host (e.g. L3 cache stats); providing the capability to restrict access to a "safe" set of events would limit the potential for the PMU to be used in any side channel attacks. This change introduces a new VM ioctl that sets an event filter. If the guest attempts to program a counter for any blacklisted or non-whitelisted event, the kernel counter won't be created, so any RDPMC/RDMSR will show 0 instances of that event. Signed-off-by: Eric Hankland [Lots of changes. All remaining bugs are probably mine. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 26 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm.h | 10 +++++++ arch/x86/kvm/pmu.c | 63 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/x86.c | 5 ++++ include/uapi/linux/kvm.h | 3 ++ 7 files changed, 110 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 91fd86fcc49f..38b0d4451a24 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4065,6 +4065,32 @@ KVM_ARM_VCPU_FINALIZE call. See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization using this ioctl. +4.120 KVM_SET_PMU_EVENT_FILTER + +Capability: KVM_CAP_PMU_EVENT_FILTER +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pmu_event_filter (in) +Returns: 0 on success, -1 on error + +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u64 events[0]; +}; + +This ioctl restricts the set of PMU events that the guest can program. +The argument holds a list of events which will be allowed or denied. +The eventsel+umask of each event the guest attempts to program is compared +against the events field to determine whether the guest should have access. +This only affects general purpose counters; fixed purpose counters can +be disabled by changing the perfmon CPUID leaf. + +Valid values for 'action': +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + 5. The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f46a12a5cf2e..34d017bd1d1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -933,6 +933,8 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + + struct kvm_pmu_event_filter *pmu_event_filter; }; struct kvm_vm_stat { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index f9b021e16ebc..46588f5d6283 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -422,4 +422,14 @@ struct kvm_nested_state { __u8 data[0]; }; +/* for KVM_CAP_PMU_EVENT_FILTER */ +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u64 events[0]; +}; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index dd745b58ffd8..9d92c4d3cd44 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -22,6 +22,9 @@ #include "lapic.h" #include "pmu.h" +/* This keeps the total size of the filter under 4k. */ +#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63 + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -144,6 +147,10 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu_event_filter *filter; + int i; + bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -155,6 +162,22 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); + if (filter) { + for (i = 0; i < filter->nevents; i++) + if (filter->events[i] == + (eventsel & AMD64_RAW_EVENT_MASK_NB)) + break; + if (filter->action == KVM_PMU_EVENT_ALLOW && + i == filter->nevents) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_DENY && + i < filter->nevents) + allow_event = false; + } + if (!allow_event) + return; + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; @@ -351,3 +374,43 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } + +int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_pmu_event_filter tmp, *filter; + size_t size; + int r; + + if (copy_from_user(&tmp, argp, sizeof(tmp))) + return -EFAULT; + + if (tmp.action != KVM_PMU_EVENT_ALLOW && + tmp.action != KVM_PMU_EVENT_DENY) + return -EINVAL; + + if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) + return -E2BIG; + + size = struct_size(filter, events, tmp.nevents); + filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + if (!filter) + return -ENOMEM; + + r = -EFAULT; + if (copy_from_user(filter, argp, size)) + goto cleanup; + + /* Ensure nevents can't be changed between the user copies. */ + *filter = tmp; + + mutex_lock(&kvm->lock); + rcu_swap_protected(kvm->arch.pmu_event_filter, filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); + + synchronize_srcu_expedited(&kvm->srcu); + r = 0; +cleanup: + kfree(filter); + return r; +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 22dff661145a..58265f761c3b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -118,6 +118,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e302e977dac..81faceba8cec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3132,6 +3132,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_PMU_EVENT_FILTER: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -4978,6 +4979,9 @@ set_identity_unlock: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } + case KVM_SET_PMU_EVENT_FILTER: + r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -9428,6 +9432,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_hv_destroy_vm(kvm); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c2152f3dd02d..a7c19540ce21 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -995,6 +995,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SVE 170 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172 +#define KVM_CAP_PMU_EVENT_FILTER 173 #ifdef KVM_CAP_IRQ_ROUTING @@ -1329,6 +1330,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) +/* Available with KVM_CAP_PMU_EVENT_FILTER */ +#define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3 From d7a08882a0a4b4e176691331ee3f492996579534 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 10 Jul 2019 09:07:34 -0700 Subject: KVM: x86: Unconditionally enable irqs in guest context On VMX, KVM currently does not re-enable irqs until after it has exited the guest context. As a result, a tick that fires in the window between VM-Exit and guest_exit_irqoff() will be accounted as system time. While said window is relatively small, it's large enough to be problematic in some configurations, e.g. if VM-Exits are consistently occurring a hair earlier than the tick irq. Intentionally toggle irqs back off so that guest_exit_irqoff() can be used in lieu of guest_exit() in order to avoid the save/restore of flags in guest_exit(). On my Haswell system, "nop; cli; sti" is ~6 cycles, versus ~28 cycles for "pushf; pop ; cli; push ; popf". Fixes: f2485b3e0c6c0 ("KVM: x86: use guest_exit_irqoff") Reported-by: Wei Yang Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +--------- arch/x86/kvm/x86.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5270711e787f..98b848fcf3e3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6184,15 +6184,7 @@ out: static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - kvm_before_interrupt(vcpu); - local_irq_enable(); - /* - * We must have an instruction with interrupts enabled, so - * the timer interrupt isn't delayed by the interrupt shadow. - */ - asm("nop"); - local_irq_disable(); - kvm_after_interrupt(vcpu); + } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 81faceba8cec..0347b1e41c40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8046,7 +8046,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->handle_exit_irqoff(vcpu); + /* + * Consume any pending interrupts, including the possible source of + * VM-Exit on SVM and any ticks that occur between VM-Exit and now. + * An instruction is required after local_irq_enable() to fully unblock + * interrupts on processors that implement an interrupt shadow, the + * stat.exits increment will do nicely. + */ + kvm_before_interrupt(vcpu); + local_irq_enable(); ++vcpu->stat.exits; + local_irq_disable(); + kvm_after_interrupt(vcpu); guest_exit_irqoff(); if (lapic_in_kernel(vcpu)) { -- cgit v1.2.3