From 98152b83e065d5593c261b0cc58197666f94a34f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 28 Aug 2017 16:38:35 +0200 Subject: KVM: x86: Remove .get_pkru() from kvm_x86_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 9dd21e104bc ('KVM: x86: simplify handling of PKRU') removed all users and providers of that call-back, but didn't remove it. Remove it now. Signed-off-by: Joerg Roedel Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8844eee290b2..bbb802382a3e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -951,7 +951,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 49a8afca386ee1775519a4aa80f8e121bd227dd4 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Tue, 5 Sep 2017 23:58:44 +0200 Subject: KVM: SVM: Add a missing 'break' statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan H. Schönherr Fixes: f6511935f424 ("KVM: SVM: Add checks for IO instructions") Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2c1cfe68a9af..af54327b9017 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5302,6 +5302,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, */ if (info->rep_prefix != REPE_PREFIX) goto out; + break; case SVM_EXIT_IOIO: { u64 exit_info; u32 bytes; -- cgit v1.2.3 From a05950009f50ca971a1d616655d01628177bd2e3 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Wed, 6 Sep 2017 00:27:19 +0200 Subject: KVM: x86: Fix handling of pending signal on uninitialized AP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM API says that KVM_RUN will return with -EINTR when a signal is pending. However, if a vCPU is in KVM_MP_STATE_UNINITIALIZED, then the return value is unconditionally -EAGAIN. Copy over some code from vcpu_run(), so that the case of a pending signal results in the expected return value. Signed-off-by: Jan H. Schönherr Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6069af86da3b..b27f7f0020e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7235,6 +7235,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } goto out; } -- cgit v1.2.3 From 2f173d2688559a6f85643d38a2ad6f45eb420c42 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Wed, 6 Sep 2017 18:34:06 +0200 Subject: KVM: x86: Fix immediate_exit handling for uninitialized AP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When user space sets kvm_run->immediate_exit, KVM is supposed to return quickly. However, when a vCPU is in KVM_MP_STATE_UNINITIALIZED, the value is not considered and the vCPU blocks. Fix that oversight. Fixes: 460df4c1fc7c008 ("KVM: race-free exit from KVM_RUN without POSIX signals") Signed-off-by: Jan H. Schönherr Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b27f7f0020e3..c777a8f681a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7231,6 +7231,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (kvm_run->immediate_exit) { + r = -EINTR; + goto out; + } kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); -- cgit v1.2.3 From 5153723388c8691b886eb9ad28dde92c01f9b191 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 13 Sep 2017 15:13:46 +0200 Subject: KVM: x86: fix clang build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang resolves __builtin_constant_p() to false even if the expression is constant in the end. The only purpose of that expression was to differentiate a case where the following expression couldn't be checked at compile-time, so we can just remove the check. Clang handles the following two correctly. Turn it into BUG_ON if there are any more problems with this. Fixes: d6321d493319 ("KVM: x86: generalize guest_cpuid_has_ helpers") Reported-by: Dmitry Vyukov Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1ea3c0e1e3a9..0bc5c1315708 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(!__builtin_constant_p(x86_leaf)); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); -- cgit v1.2.3 From dfa20099e26e35cd9e83d4d43172615c73c4e9f5 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:40 -0500 Subject: KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparing the base code for subsequent changes. This does not change existing logic. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af54327b9017..cab020acb509 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1600,6 +1600,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +static int avic_init_vcpu(struct vcpu_svm *svm) +{ + int ret; + + if (!avic) + return 0; + + ret = avic_init_backing_page(&svm->vcpu); + if (ret) + return ret; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + + return ret; +} + static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; @@ -1636,14 +1653,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; - if (avic) { - err = avic_init_backing_page(&svm->vcpu); - if (err) - goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); - } + err = avic_init_vcpu(svm); + if (err) + goto free_page4; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. -- cgit v1.2.3 From b2a05feff274e007abd7cda0d2cdae7fdf5cbb0a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:41 -0500 Subject: KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify struct kvm_x86_ops.arch.apicv_active() to take struct kvm_vcpu pointer as parameter in preparation to subsequent changes. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bbb802382a3e..c73e493adf07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -972,7 +972,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(void); + bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cab020acb509..76f559f5a3dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4407,7 +4407,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static bool svm_get_enable_apicv(void) +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { return avic; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4253adef9044..09204993a739 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ } } -static bool vmx_get_enable_apicv(void) +static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c777a8f681a8..f4a978e36030 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7980,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) -- cgit v1.2.3 From 67034bb9dd5ecc0171bdacfbd31ebf7c2634df98 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:42 -0500 Subject: KVM: SVM: Add irqchip_split() checks before enabling AVIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SVM AVIC hardware accelerates guest write to APIC_EOI register (for edge-trigger interrupt), which means it does not trap to KVM. So, only enable SVM AVIC only in split irqchip mode. (e.g. launching qemu w/ option '-machine kernel_irqchip=split'). Suggested-by: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support") [Removed pr_debug - Radim.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 76f559f5a3dd..0e68f0b3cbf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - svm->vcpu.arch.apicv_active = true; } static void init_vmcb(struct vcpu_svm *svm) @@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } - if (avic) + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); /* @@ -1604,7 +1603,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) { int ret; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return 0; ret = avic_init_backing_page(&svm->vcpu); @@ -4409,7 +4408,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { - return avic; + return avic && irqchip_split(vcpu->kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -4426,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return; vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; -- cgit v1.2.3 From a5f01f8e9756b40a56e351f98b3fae6d235b834f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 13 Sep 2017 04:04:01 -0700 Subject: KVM: X86: Don't block vCPU if there is pending exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't block vCPU if there is pending exception. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4a978e36030..bfda79fb102c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8461,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; + if (vcpu->arch.exception.pending) + return true; + if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu))) -- cgit v1.2.3 From 9a6e7c39810e4a8bc7fc95056cefb40583fe07ef Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Sep 2017 03:54:16 -0700 Subject: KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu-system-x86-8600 [004] d..1 7205.687530: kvm_entry: vcpu 2 qemu-system-x86-8600 [004] .... 7205.687532: kvm_exit: reason EXCEPTION_NMI rip 0xffffffffa921297d info ffffeb2c0e44e018 80000b0e qemu-system-x86-8600 [004] .... 7205.687532: kvm_page_fault: address ffffeb2c0e44e018 error_code 0 qemu-system-x86-8600 [004] .... 7205.687620: kvm_try_async_get_page: gva = 0xffffeb2c0e44e018, gfn = 0x427e4e qemu-system-x86-8600 [004] .N.. 7205.687628: kvm_async_pf_not_present: token 0x8b002 gva 0xffffeb2c0e44e018 kworker/4:2-7814 [004] .... 7205.687655: kvm_async_pf_completed: gva 0xffffeb2c0e44e018 address 0x7fcc30c4e000 qemu-system-x86-8600 [004] .... 7205.687703: kvm_async_pf_ready: token 0x8b002 gva 0xffffeb2c0e44e018 qemu-system-x86-8600 [004] d..1 7205.687711: kvm_entry: vcpu 2 After running some memory intensive workload in guest, I catch the kworker which completes the GUP too quickly, and queues an "Page Ready" #PF exception after the "Page not Present" exception before the next vmentry as the above trace which will result in #DF injected to guest. This patch fixes it by clearing the queue for "Page not Present" if "Page Ready" occurs before the next vmentry since the GUP has already got the required page and shadow page table has already been fixed by "Page Ready" handler. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Fixes: 7c90705bf2a3 ("KVM: Inject asynchronous page fault into a PV guest if page is swapped out.") [Changed indentation and added clearing of injected. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfda79fb102c..cd17b7d9a107 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8631,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) sizeof(val)); } +static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, + sizeof(u32)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -8658,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; + u32 val; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -8665,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->gva); - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { + if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && + vcpu->arch.exception.pending && + vcpu->arch.exception.nr == PF_VECTOR && + !apf_put_user(vcpu, 0)) { + vcpu->arch.exception.injected = false; + vcpu->arch.exception.pending = false; + vcpu->arch.exception.nr = 0; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.error_code = 0; + } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + fault.async_page_fault = true; + kvm_inject_page_fault(vcpu, &fault); + } } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -- cgit v1.2.3 From 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 12 Sep 2017 13:02:54 -0700 Subject: kvm: nVMX: Don't allow L2 to access the hardware CR8 If L1 does not specify the "use TPR shadow" VM-execution control in vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store exiting" VM-execution controls in vmcs02. Failure to do so will give the L2 VM unrestricted read/write access to the hardware CR8. This fixes CVE-2017-12154. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 09204993a739..8e1ae716f938 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10525,6 +10525,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & CPU_BASED_TPR_SHADOW) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } /* -- cgit v1.2.3 From 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Thu, 7 Sep 2017 19:02:30 +0100 Subject: KVM: VMX: Do not BUG() on out-of-bounds guest IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of the guest_irq argument to vmx_update_pi_irte() is ultimately coming from a KVM_IRQFD API call. Do not BUG() in vmx_update_pi_irte() if the value is out-of bounds. (Especially, since KVM as a whole seems to hang after that.) Instead, print a message only once if we find that we don't have a route for a certain IRQ (which can be out-of-bounds or within the array). This fixes CVE-2017-1000252. Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts") Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1ae716f938..0b15b43ef45d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11834,7 +11834,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11843,7 +11843,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) -- cgit v1.2.3 From cc1b46803a671047be22f7832ef4a2bb3f63dd94 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:20 -0700 Subject: kvm,lapic: Justify use of swait_active() A comment might serve future readers. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index aaf10b6f5380..69c5612be786 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + /* + * For x86, the atomic_inc() is serialized, thus + * using swait_active() is safe. + */ if (swait_active(q)) swake_up(q); -- cgit v1.2.3 From a0cff57bb2a41cb9cbf13d3203097b4156d8c0ae Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:21 -0700 Subject: kvm,x86: Fix apf_task_wake_one() wq serialization During code inspection, the following potential race was seen: CPU0 CPU1 kvm_async_pf_task_wait apf_task_wake_one [L] swait_active(&n->wq) [S] prepare_to_swait(&n.wq) [L] if (!hlist_unhahed(&n.link)) schedule() [S] hlist_del_init(&n->link); Properly serialize swait_active() checks such that a wakeup is not missed. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 874827b0d7ca..aa60a08b65b1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (swait_active(&n->wq)) + else if (swq_has_sleeper(&n->wq)) swake_up(&n->wq); } -- cgit v1.2.3 From 7881f96cac4d420c94e62a4e1eea243899a7052e Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:40 -0700 Subject: kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry After a successful VM-entry, RFLAGS is cleared, with the exception of bit 1, which is always set. This is handled by load_vmcs12_host_state. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0b15b43ef45d..29f85ed5a329 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11491,16 +11491,18 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - /* - * Exiting from L2 to L1, we're now back to L1 which thinks it just - * finished a VMLAUNCH or VMRESUME instruction, so we need to set the - * success or failure flag accordingly. - */ if (unlikely(vmx->fail)) { + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ vmx->fail = 0; nested_vmx_failValid(vcpu, vm_inst_error); - } else - nested_vmx_succeed(vcpu); + } + if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; -- cgit v1.2.3 From b060ca3b2e9e72ef005e2042476f95ee0b8839e9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:42 -0700 Subject: kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly On an early VMLAUNCH/VMRESUME failure (i.e. one which sets the VM-instruction error field of the current VMCS), the launch state of the current VMCS is not set to "launched," and the VM-exit information fields of the current VMCS (including IDT-vectoring information and exit reason) are stale. On a late VMLAUNCH/VMRESUME failure (i.e. one which sets the high bit of the exit reason field), the launch state of the current VMCS is not set to "launched," and only two of the VM-exit information fields of the current VMCS are modified (exit reason and exit qualification). The remaining VM-exit information fields of the current VMCS (including IDT-vectoring information, in particular) are stale. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f85ed5a329..a0a78f09b22d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - /* * eager fpu is enabled if PKEY is supported and CR4 is switched * back on host, so it is safe to read guest PKRU from current @@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; + + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; + + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); -- cgit v1.2.3 From 4f350c6dbcb9000e18907515ec8a7b205ac33c69 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:44 -0700 Subject: kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly When emulating a nested VM-entry from L1 to L2, several control field validation checks are deferred to the hardware. Should one of these validation checks fail, vcpu_vmx_run will set the vmx->fail flag. When this happens, the L2 guest state is not loaded (even in part), and execution should continue in L1 with the next instruction after the VMLAUNCH/VMRESUME. The VMCS12 is not modified (except for the VM-instruction error field), the VMCS12 MSR save/load lists are not processed, and the CPU state is not loaded from the VMCS12 host area. Moreover, the vmcs02 exit reason is stale, so it should not be consulted for any reason. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 134 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a0a78f09b22d..a406afbb6d21 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) + return false; + + if (unlikely(vmx->fail)) { + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); + return true; + } /* * The host physical addresses of some pages of guest memory @@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) */ nested_mark_vmcs12_pages_dirty(vcpu); - if (vmx->nested.nested_run_pending) - return false; - - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: @@ -11395,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. + */ + WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD)); + leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (likely(!vmx->fail)) { + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); - if (unlikely(vmx->fail)) - vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } vmx_switch_vmcs(vcpu, &vmx->vmcs01); - - /* - * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of - * the VM-exit interrupt information (valid interrupt) is always set to - * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need - * kvm_cpu_has_interrupt(). See the commit message for details. - */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -11443,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - load_vmcs12_host_state(vcpu, vmcs12); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); @@ -11493,23 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (unlikely(vmx->fail)) { - /* - * After an early L2 VM-entry failure, we're now back - * in L1 which thinks it just finished a VMLAUNCH or - * VMRESUME instruction, so we need to set the failure - * flag and the VM-instruction error field of the VMCS - * accordingly. - */ - vmx->fail = 0; - nested_vmx_failValid(vcpu, vm_inst_error); - } - if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + /* + * The emulated instruction was already skipped in + * nested_vmx_run, but the updated RIP was never + * written back to the vmcs01. + */ + skip_emulated_instruction(vcpu); + vmx->fail = 0; } /* -- cgit v1.2.3