From cf81a7e580ac0d598d3e7e9c06864168b2b4073d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jul 2018 09:54:30 -0700 Subject: KVM: vmx: remove save/restore of host BNDCGFS MSR Linux does not support Memory Protection Extensions (MPX) in the kernel itself, thus the BNDCFGS (Bound Config Supervisor) MSR will always be zero in the KVM host, i.e. RDMSR in vmx_save_host_state() is superfluous. KVM unconditionally sets VM_EXIT_CLEAR_BNDCFGS, i.e. BNDCFGS will always be zero after VMEXIT, thus manually loading BNDCFGS is also superfluous. And in the event the MPX kernel support is added (unlikely given that MPX for userspace is in its death throes[1]), BNDCFGS will likely be common across all CPUs[2], and at the least shouldn't change on a regular basis, i.e. saving the MSR on every VMENTRY is completely unnecessary. WARN_ONCE in hardware_setup() if the host's BNDCFGS is non-zero to document that KVM does not preserve BNDCFGS and to serve as a hint as to how BNDCFGS likely should be handled if MPX is used in the kernel, e.g. BNDCFGS should be saved once during KVM setup. [1] https://lkml.org/lkml/2018/4/27/1046 [2] http://www.openwall.com/lists/kernel-hardening/2017/07/24/28 Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e30da9a2430c..6318167b1464 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -802,7 +802,6 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; - u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -2630,8 +2629,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif - if (boot_cpu_has(X86_FEATURE_MPX)) - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -2669,8 +2666,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (vmx->host_state.msr_host_bndcfgs) - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); load_fixmap_gdt(raw_smp_processor_id()); } @@ -7502,6 +7497,7 @@ static void vmx_enable_tdp(void) static __init int hardware_setup(void) { + unsigned long host_bndcfgs; int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7526,6 +7522,11 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; -- cgit v1.2.3 From 7f7f1ba33cf2c21d001821313088c231db42ff40 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Jul 2018 18:49:01 +0200 Subject: KVM: x86: do not load vmcs12 pages while still in SMM If the vCPU enters system management mode while running a nested guest, RSM starts processing the vmentry while still in SMM. In that case, however, the pages pointed to by the vmcs12 might be incorrectly loaded from SMRAM. To avoid this, delay the handling of the pages until just before the next vmentry. 
This is done with a new request and a new entry in kvm_x86_ops, which we will be able to reuse for nested VMX state migration. Extracted from a patch by Jim Mattson and KarimAllah Ahmed. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 41 insertions(+), 18 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c13cd28d9d1b..da957725992d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -75,6 +75,7 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1085,6 +1086,8 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6318167b1464..fee44e4c5c79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10660,9 +10660,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct page *page; u64 hpa; @@ -11774,12 +11774,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) +/* + * If exit_qual is NULL, this is being called from RSM. + * Otherwise it's called from vmlaunch/vmresume. + */ +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_qual; - int r; + bool from_vmentry = !!exit_qual; + u32 dummy_exit_qual; + int r = 0; enter_guest_mode(vcpu); @@ -11793,17 +11798,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) vcpu->arch.tsc_offset += vmcs12->tsc_offset; r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) goto fail; - nested_get_vmcs12_pages(vcpu, vmcs12); + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); - r = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto fail; + r = EXIT_REASON_MSR_LOAD_FAIL; + *exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (*exit_qual) + goto fail; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. 
+ */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11818,8 +11834,7 @@ fail: vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); - return 1; + return r; } /* @@ -11896,10 +11911,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } /* @@ -12985,7 +13001,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, NULL); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -13134,6 +13150,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f0fabdb2109..fbd59ad047b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7260,6 +7260,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) + kvm_x86_ops->get_vmcs12_pages(vcpu); if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) -- cgit v1.2.3 From 8fcc4b5923af5de58b80b53a069453b135693304 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 10 Jul 2018 11:27:20 +0200 Subject: kvm: nVMX: Introduce KVM_CAP_NESTED_STATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For nested virtualization L0 KVM is managing a bit of state for L2 guests, this state can not be captured through the currently available IOCTLs. In fact the state captured through all of these IOCTLs is usually a mix of L1 and L2 state. It is also dependent on whether the L2 guest was running at the moment when the process was interrupted to save its state. With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM that is in VMX operation. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jim Mattson [karahmed@ - rename structs and functions and make them ready for AMD and address previous comments. - handle nested.smm state. - rebase & a bit of refactoring. - Merge 7/8 and 8/8 into one patch. 
] Signed-off-by: KarimAllah Ahmed Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 56 ++++++++++++ arch/x86/include/asm/kvm_host.h | 6 ++ arch/x86/include/uapi/asm/kvm.h | 37 ++++++++ arch/x86/kvm/vmx.c | 175 +++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 54 ++++++++++++ include/uapi/linux/kvm.h | 4 + 6 files changed, 330 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index cb8db4f9d097..7b83b176c662 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3561,6 +3561,62 @@ Returns: 0 on success, -ENOENT on deassign if the conn_id isn't registered -EEXIST on assign if the conn_id is already registered +4.114 KVM_GET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in/out) +Returns: 0 on success, -1 on error +Errors: + E2BIG: the total state size (including the fixed-size part of struct + kvm_nested_state) exceeds the value of 'size' specified by + the user; the size required will be written into size. + +struct kvm_nested_state { + __u16 flags; + __u16 format; + __u32 size; + union { + struct kvm_vmx_nested_state vmx; + struct kvm_svm_nested_state svm; + __u8 pad[120]; + }; + __u8 data[0]; +}; + +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +This ioctl copies the vcpu's nested virtualization state from the kernel to +userspace. + +The maximum size of the state, including the fixed-size part of struct +kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to +the KVM_CHECK_EXTENSION ioctl(). + +4.115 KVM_SET_NESTED_STATE + +Capability: KVM_CAP_NESTED_STATE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_nested_state (in) +Returns: 0 on success, -1 on error + +This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For +the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. 5. 
The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da957725992d..bd287b348751 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1086,6 +1086,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*get_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fee44e4c5c79..4be6486173b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7589,6 +7589,11 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -11775,8 +11780,8 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * If exit_qual is NULL, this is being called from RSM. - * Otherwise it's called from vmlaunch/vmresume. + * If exit_qual is NULL, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
*/ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { @@ -13016,6 +13021,170 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx->nested.current_vmptr != -1ull) + kvm_state.size += VMCS12_SIZE; + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (vmx->nested.current_vmptr == -1ull) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow vmcs linked to vmcs01, unless + * sync_shadow_vmcs is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) + sync_vmcs12(vcpu, vmcs12); + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vmx->nested.nested_run_pending = 1; + + vmx->nested.dirty_vmcs12 = true; + ret = enter_vmx_non_root_mode(vcpu, NULL); + if (ret) + return -EINVAL; + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -13150,6 +13319,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, .smi_allowed = vmx_smi_allowed, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbd59ad047b0..1b14c4a654c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2947,6 +2947,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; + case KVM_CAP_NESTED_STATE: + r = kvm_x86_ops->get_nested_state ? 
+ kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + break; default: break; } @@ -3963,6 +3967,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_GET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + u32 user_data_size; + + r = -EINVAL; + if (!kvm_x86_ops->get_nested_state) + break; + + BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + if (get_user(user_data_size, &user_kvm_nested_state->size)) + return -EFAULT; + + r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + user_data_size); + if (r < 0) + return r; + + if (r > user_data_size) { + if (put_user(r, &user_kvm_nested_state->size)) + return -EFAULT; + return -E2BIG; + } + r = 0; + break; + } + case KVM_SET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + struct kvm_nested_state kvm_state; + + r = -EINVAL; + if (!kvm_x86_ops->set_nested_state) + break; + + if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) + return -EFAULT; + + if (kvm_state.size < sizeof(kvm_state)) + return -EINVAL; + + if (kvm_state.flags & + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + /* nested_run_pending implies guest_mode. */ + if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + return -EINVAL; + + r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + break; + } default: r = -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b955b986b341..3cf632839337 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -950,6 +950,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_EVENTFD 154 #define KVM_CAP_HYPERV_TLBFLUSH 155 #define KVM_CAP_S390_HPAGE_1M 156 +#define KVM_CAP_NESTED_STATE 157 #ifdef KVM_CAP_IRQ_ROUTING @@ -1392,6 +1393,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_HYPERV_EVENTFD */ #define KVM_HYPERV_EVENTFD _IOW(KVMIO, 0xbd, struct kvm_hyperv_eventfd) +/* Available with KVM_CAP_NESTED_STATE */ +#define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) +#define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) /* Secure Encrypted Virtualization command */ enum sev_cmd_id { -- cgit v1.2.3 From 392b2f25aa415c0222b348f95875409be49a1201 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:01 +0300 Subject: KVM: VMX: Create struct for VMCS header No functionality change. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4be6486173b7..37944b79da45 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,8 +198,13 @@ struct kvm_vmx { #define NR_AUTOLOAD_MSRS 8 +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + struct vmcs { - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; char data[0]; }; @@ -253,7 +258,7 @@ struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. 
*/ - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ @@ -421,7 +426,7 @@ struct __packed vmcs12 { "Offset of " #field " in struct vmcs12 has changed.") static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); @@ -4399,9 +4404,9 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) /* KVM supports Enlightened VMCS v1 only */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = KVM_EVMCS_VERSION; + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; return vmcs; } @@ -4581,7 +4586,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; } @@ -7889,7 +7894,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!shadow_vmcs) goto out_shadow_vmcs; /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); + shadow_vmcs->hdr.shadow_vmcs = 1; /* init shadow vmcs */ vmcs_clear(shadow_vmcs); vmx->vmcs01.shadow_vmcs = shadow_vmcs; @@ -8459,7 +8464,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + new_vmcs12->hdr.shadow_vmcs) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, @@ -13161,7 +13167,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) return -EFAULT; - if (vmcs12->revision_id != VMCS12_REVISION) + if (vmcs12->hdr.revision_id != VMCS12_REVISION) return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) -- cgit v1.2.3 From e253674227328575084cf7454d22749b9be11d52 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:02 +0300 Subject: KVM: VMX: Change vmcs12_{read,write}_any() to receive vmcs12 as parameter No functionality change. This is done as a preparation for VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 37944b79da45..82b01b34a3a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8159,7 +8159,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). 
*/ -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); @@ -8168,7 +8168,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - p = ((char *)(get_vmcs12(vcpu))) + offset; + p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: @@ -8190,10 +8190,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, } -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); - char *p = ((char *) get_vmcs12(vcpu)) + offset; + char *p = (char *)vmcs12 + offset; if (offset < 0) return offset; @@ -8246,7 +8246,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; field_value = __vmcs_readl(field); - vmcs12_write_any(&vmx->vcpu, field, field_value); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); } /* * Skip the VM-exit information fields if they are read-only. @@ -8281,7 +8281,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(&vmx->vcpu, field, &field_value); + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); __vmcs_writel(field, field_value); } } @@ -8321,7 +8321,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { + if (vmcs12_read_any(get_vmcs12(vcpu), field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8397,7 +8397,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(vcpu, field, field_value) < 0) { + if (vmcs12_write_any(get_vmcs12(vcpu), field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -10971,11 +10971,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; u64 count, addr; - if (vmcs12_read_any(vcpu, count_field, &count) || - vmcs12_read_any(vcpu, addr_field, &addr)) { + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { WARN_ON(1); return -EINVAL; } -- cgit v1.2.3 From fa97d7dba7538adf174c3984e726c0537ab553a4 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Wed, 18 Jul 2018 14:07:59 +0200 Subject: KVM: nVMX: Allow VMPTRLD for shadow VMCS if vCPU supports VMCS shadowing Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82b01b34a3a2..44e2b82f6519 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1720,6 +1720,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) CPU_BASED_MONITOR_TRAP_FLAG; } +static inline 
bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -8465,7 +8471,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } new_vmcs12 = kmap(page); if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - new_vmcs12->hdr.shadow_vmcs) { + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, -- cgit v1.2.3 From a6192d40d52f6b86997a71449e2ebc3d7f5ca103 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:04 +0300 Subject: KVM: nVMX: Fail VMLAUNCH and VMRESUME on shadow VMCS Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 44e2b82f6519..f6ec898ac539 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11876,6 +11876,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12 = get_vmcs12(vcpu); + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) { + nested_vmx_failInvalid(vcpu); + goto out; + } + if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); -- cgit v1.2.3 From f792d2743ed4db9e96eff43bdc1e15dd441f19bf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:05 +0300 Subject: KVM: nVMX: Introduce nested_cpu_has_shadow_vmcs() Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f6ec898ac539..64b11c57b5f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1806,6 +1806,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) VMX_VMFUNC_EPTP_SWITCHING); } +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -11745,7 +11750,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && + if (!nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; -- cgit v1.2.3 From a8a7c02bf7b70cda6face6321a45de56519c24bf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:06 +0300 Subject: KVM: nVMX: Verify VMCS shadowing controls Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64b11c57b5f4..e762222476a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11038,6 +11038,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int 
nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -11639,6 +11652,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high) || -- cgit v1.2.3 From f145d90d97bab0e11b78da1739e5db742575037c Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:07 +0300 Subject: KVM: nVMX: Verify VMCS shadowing VMCS link pointer Intel SDM considers these checks to be part of "Checks on Guest Non-Register State". Note that it is legal for vmcs->vmcs_link_pointer to be -1ull when VMCS shadowing is enabled. In this case, any VMREAD/VMWRITE to shadowed-field sets the ALU flags for VMfailInvalid (i.e. CF=1). Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e762222476a9..f8ad42bcfc2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11755,6 +11755,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return 0; } +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { @@ -11766,8 +11793,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } -- cgit v1.2.3 From 61ada7488ffdef0c0f83da14a12629870abb9e97 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:08 +0300 Subject: KVM: nVMX: Cache shadow vmcs12 on VMEntry and flush to memory on VMExit This is done is done as a preparation to VMCS shadowing emulation. 
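In outline, the mechanism added below boils down to two helpers: one copies the guest's shadow VMCS into L0-owned storage at nested VM entry, the other writes it back to guest memory at nested VM exit. A condensed sketch follows (the nested_cpu_has_shadow_vmcs()/vmcs_link_pointer guards and error handling present in both helpers are elided here; see the full hunks in the diff):

	/* nested VM entry: guest page at vmcs_link_pointer -> cached_shadow_vmcs12 */
	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
	memcpy(get_shadow_vmcs12(vcpu), kmap(page), VMCS12_SIZE);
	kunmap(page);
	kvm_release_page_clean(page);

	/* nested VM exit: cached_shadow_vmcs12 -> guest page at vmcs_link_pointer */
	kvm_write_guest(vcpu->kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
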
Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8ad42bcfc2d..2a005f519032 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -644,6 +644,12 @@ struct nested_vmx { * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 @@ -1076,6 +1082,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); @@ -7900,6 +7911,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -7919,6 +7934,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return 0; out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: @@ -8085,6 +8103,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -10926,6 +10945,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -11995,6 +12046,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). 
+ * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu. @@ -12504,6 +12567,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); -- cgit v1.2.3 From fa58a9fa74979f845fc6c94a58501e67f3abb6de Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Jul 2018 19:45:51 +0200 Subject: KVM: nVMX: include shadow vmcs12 in nested state The shadow vmcs12 cannot be flushed on KVM_GET_NESTED_STATE, because at that point guest memory is assumed by userspace to be immutable. Capture the cache in vmx_get_nested_state, adding another page at the end if there is an active shadow vmcs12. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a005f519032..c7d08a54ab8d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -13191,9 +13191,15 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; - if (vmx->nested.current_vmptr != -1ull) + if (vmx->nested.current_vmptr != -1ull) { kvm_state.size += VMCS12_SIZE; + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + if (vmx->nested.smm.vmxon) kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; @@ -13232,6 +13238,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) return -EFAULT; + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + out: return kvm_state.size; } @@ -13316,6 +13329,22 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + if (check_vmentry_prereqs(vcpu, vmcs12) || check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; -- cgit v1.2.3 From 6d894f498f5d121b57a231315b0b459e185abed1 Mon Sep 17 
00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:09 +0300 Subject: KVM: nVMX: vmread/vmwrite: Use shadow vmcs12 if running L2 This is done as a preparation to VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7d08a54ab8d..f91a1599738e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8341,6 +8341,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8348,10 +8349,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_vmcs12(vcpu)) return kvm_skip_emulated_instruction(vcpu); + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + } + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(get_vmcs12(vcpu), field, &field_value) < 0) { + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8393,6 +8408,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8427,23 +8443,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(get_vmcs12(vcpu), field, field_value) < 0) { + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } - switch (field) { + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { #define SHADOW_FIELD_RW(x) case x: #include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. 
+ */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } } nested_vmx_succeed(vcpu); -- cgit v1.2.3 From a7cde481b6e8506ee147cf5031ad1917fcc8bf9b Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:10 +0300 Subject: KVM: nVMX: Do not forward VMREAD/VMWRITE VMExits to L1 if required so by vmcs12 vmread/vmwrite bitmaps This is done as a preparation for VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f91a1599738e..27fd3e1653af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9107,6 +9107,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + /* * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we * should handle it ourselves in L0 (and then continue L2). Only call this @@ -9191,10 +9215,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* -- cgit v1.2.3 From 32c7acf044873c8782be45a03e46cf0ac579a459 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:11 +0300 Subject: KVM: nVMX: Expose VMCS shadowing to L1 guest Expose VMCS shadowing to L1 as a VMX capability of the virtual CPU, whether or not VMCS shadowing is supported by the physical CPU. (VMCS shadowing emulation) Shadowed VMREADs and VMWRITEs from L2 are handled by L0, without a VM-exit to L1. 
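Because the control is only emulated, the patch below is deliberately small: the capability bit is advertised to L1 unconditionally, and the same bit is stripped from the secondary controls actually programmed into vmcs02, so hardware VMCS shadowing is never enabled on behalf of L2 and every shadowed access is serviced by L0 from the cached shadow vmcs12. Condensed from the two hunks that follow:

	/* nested_vmx_setup_ctls_msrs(): advertise the feature to L1 */
	msrs->secondary_ctls_high |= SECONDARY_EXEC_SHADOW_VMCS;

	/* prepare_vmcs02(): but never enable real shadowing for L2 */
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
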
Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 27fd3e1653af..7cf64cbc3afd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3308,6 +3308,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -11550,6 +11556,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); -- cgit v1.2.3 From 491a6038458fc07ec307447dd5e6722e34d67c04 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:12 +0300 Subject: KVM: VMX: Mark vmcs header as shadow in case alloc_vmcs_cpu() allocate shadow vmcs No functionality change. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7cf64cbc3afd..098926a07cf7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4418,7 +4418,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -4436,6 +4436,8 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) else vmcs->hdr.revision_id = vmcs_config.revision_id; + if (shadow) + vmcs->hdr.shadow_vmcs = 1; return vmcs; } @@ -4459,14 +4461,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(void) +static struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); } static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { - loaded_vmcs->vmcs = alloc_vmcs(); + loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; @@ -4597,7 +4599,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(false, cpu); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -7922,11 +7924,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) goto out_cached_shadow_vmcs12; if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); + shadow_vmcs = alloc_vmcs(true); if (!shadow_vmcs) goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->hdr.shadow_vmcs = 1; /* init shadow vmcs */ vmcs_clear(shadow_vmcs); vmx->vmcs01.shadow_vmcs = shadow_vmcs; -- cgit v1.2.3 From abfc52c612ddb101549846688bc87dfedbbfd4f4 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:13 +0300 Subject: KVM: nVMX: Separate logic allocating shadow vmcs to a function No functionality change. 
This is done as a preparation for VMCS shadowing virtualization. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 098926a07cf7..cbe31b9d2b81 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7905,10 +7905,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) return 0; } +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); @@ -7923,14 +7948,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(true); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); -- cgit v1.2.3 From 50c28f21d045dde8c52548f8482d456b3f0956f5 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:11 -0700 Subject: kvm: x86: Use fast CR3 switch for nested VMX Use the fast CR3 switch mechanism to locklessly change the MMU root page when switching between L1 and L2. The switch from L2 to L1 should always go through the fast path, while the switch from L1 to L2 should go through the fast path if L1's CR3/EPTP for L2 hasn't changed since the last time. 
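The visible effect in nested_vmx_load_cr3() is that the unconditional kvm_mmu_reset_context() disappears: the root is switched (locklessly when the cached previous root matches) and the MMU is re-initialized without forcing new roots. A condensed sketch of the resulting shape, taken from the hunk below:

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3);	/* fast path may reuse prev_root */

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

	kvm_init_mmu(vcpu, false);		/* replaces kvm_mmu_reset_context() */

For the EPT case, kvm_init_shadow_ept_mmu() now receives the new EPTP and performs the equivalent __kvm_mmu_new_cr3() switch itself.
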
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 ++++-- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/vmx.c | 16 ++++++++++------ virt/kvm/kvm_main.c | 14 ++++++++++---- 4 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d822a97dfa1..9b73cfcef917 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4054,7 +4054,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; swap(mmu->root_hpa, mmu->prev_root.hpa); - mmu->prev_root.cr3 = kvm_read_cr3(vcpu); + mmu->prev_root.cr3 = mmu->get_cr3(vcpu); if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa) && @@ -4091,6 +4091,7 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) { __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); } +EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); static unsigned long get_cr3(struct kvm_vcpu *vcpu) { @@ -4725,12 +4726,13 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty) + bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.mmu; union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11ab3d62ad65..3177127cee96 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,7 +64,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty); + bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cbe31b9d2b81..c6ea892a610e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10741,11 +10741,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; - kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu)); + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -11342,12 +11342,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 1; } } - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } - kvm_mmu_reset_context(vcpu); + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f519eb8d06b1..8f461e0ed382 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2127,16 +2127,22 @@ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) { + int ret = -EINTR; + int idx = srcu_read_lock(&vcpu->kvm->srcu); + if 
(kvm_arch_vcpu_runnable(vcpu)) { kvm_make_request(KVM_REQ_UNHALT, vcpu); - return -EINTR; + goto out; } if (kvm_cpu_has_pending_timer(vcpu)) - return -EINTR; + goto out; if (signal_pending(current)) - return -EINTR; + goto out; - return 0; + ret = 0; +out: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + return ret; } /* -- cgit v1.2.3 From afe828d1de4047d26eb0cd0c0154f5ac3722bf63 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:12 -0700 Subject: kvm: x86: Add ability to skip TLB flush when switching CR3 Remove the implicit flush from the set_cr3 handlers, so that the callers are able to decide whether to flush the TLB or not. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/mmu.h | 1 - arch/x86/kvm/svm.c | 4 ---- arch/x86/kvm/vmx.c | 1 - 4 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b73cfcef917..d3a04cf6514b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4070,6 +4070,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -4851,6 +4852,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_cr3(vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); out: return r; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3177127cee96..6a2a97d8015b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -88,7 +88,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { - /* set_cr3() should ensure TLB has been flushed */ if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f059a73f0fd0..be9931cad409 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -5766,7 +5765,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5779,8 +5777,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) /* Also sync guest cr3 here in case we live migrate */ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - - svm_flush_tlb(vcpu, true); } static int is_disabled(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6ea892a610e..903c130bb811 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5029,7 +5029,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } -- cgit v1.2.3 From eb4b248e152d3ecf189b9d32c04961360dbd938a Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:14 -0700 Subject: kvm: vmx: Support INVPCID in shadow paging mode Implement support for INVPCID in shadow paging mode as well. 
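For context (a sketch inferred from the handler added below, not a separate interface): the emulation reads a register-selected invalidation type (0-3) plus a 16-byte descriptor from guest memory, and only type 0 (individual address) consumes the linear-address field:

/* Hypothetical name; layout matches the anonymous struct in handle_invpcid(). */
struct invpcid_desc {
	u64 pcid;	/* bits 11:0 = PCID; bits 63:12 must be zero or #GP */
	u64 gla;	/* linear address, used only for type 0 */
};
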
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 18 ++++++++ arch/x86/kvm/vmx.c | 96 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2b4df8a03cd..1c253f15f9cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1317,6 +1317,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); void kvm_enable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3a04cf6514b..2af15db12ccd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5187,6 +5187,24 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) +{ + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + if (pcid == kvm_get_active_pcid(vcpu)) { + mmu->invlpg(vcpu, gva); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + ++vcpu->stat.invlpg; + + /* + * Mappings not reachable via the current cr3 will be synced when + * switching to that cr3, so nothing needs to be done here for them. + */ +} +EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); + void kvm_enable_tdp(void) { tdp_enabled = true; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 903c130bb811..80be024cb979 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3065,7 +3065,7 @@ static bool vmx_rdtscp_supported(void) static bool vmx_invpcid_supported(void) { - return cpu_has_vmx_invpcid() && enable_ept; + return cpu_has_vmx_invpcid(); } /* @@ -6101,8 +6101,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; - /* Enable INVPCID for non-ept guests may cause performance regression. 
*/ - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -8756,6 +8754,97 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + /* + * If the current cr3 does not use the given PCID, then nothing + * needs to be done here because a resync will happen anyway + * before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -8959,6 +9048,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; -- cgit v1.2.3 From ade61e2824443a208bb3aaafd8b345ce878298cd Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:15 -0700 Subject: kvm: x86: Skip TLB flush on fast CR3 switch when indicated by guest When PCIDs are enabled, the MSb of the source operand for a MOV-to-CR3 instruction indicates that the TLB doesn't need to be flushed. This change enables this optimization for MOV-to-CR3s in the guest that have been intercepted by KVM for shadow paging and are handled within the fast CR3 switch path. 
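A minimal sketch of the bit in question (mirroring the kvm_set_cr3() hunk below): with CR4.PCIDE set, bit 63 of the MOV-to-CR3 source operand indicates that no TLB invalidation is required for the new PCID, and it must be stripped before the value is used as a CR3:

	bool skip_tlb_flush = false;

	if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
		skip_tlb_flush = cr3 & CR3_PCID_INVD;	/* bit 63 */
		cr3 &= ~CR3_PCID_INVD;
	}
	/* The flag is then threaded down into the fast CR3 switch path. */
	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
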
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++--------- arch/x86/kvm/vmx.c | 12 ++++++++---- arch/x86/kvm/x86.c | 11 ++++++++--- 4 files changed, 36 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1c253f15f9cd..8b4aa5e7ff92 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1318,7 +1318,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); void kvm_enable_tdp(void); void kvm_disable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2af15db12ccd..2b7cfc8e41bb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4037,7 +4037,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, } static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role) + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { struct kvm_mmu *mmu = &vcpu->arch.mmu; @@ -4070,7 +4071,9 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + if (!skip_tlb_flush) + kvm_x86_ops->tlb_flush(vcpu, true); + __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -4082,15 +4085,17 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, } static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role) + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { - if (!fast_cr3_switch(vcpu, new_cr3, new_role)) + if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) kvm_mmu_free_roots(vcpu, false); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { - __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush); } EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); @@ -4733,7 +4738,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role); + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -5196,11 +5201,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (VALID_PAGE(mmu->prev_root.hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + ++vcpu->stat.invlpg; /* - * Mappings not reachable via the current cr3 will be synced when - * switching to that cr3, so nothing needs to be done here for them. + * Mappings not reachable via the current cr3 or the prev_root.cr3 will + * be synced when switching to that cr3, so nothing needs to be done + * here for them. 
*/ } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 80be024cb979..6151418cec32 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8819,10 +8819,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) + == operand.pcid) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + /* - * If the current cr3 does not use the given PCID, then nothing - * needs to be done here because a resync will happen anyway - * before switching to any other CR3. + * If neither the current cr3 nor the prev_root.cr3 use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. */ return kvm_skip_emulated_instruction(vcpu); @@ -11434,7 +11438,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne } if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3); + kvm_mmu_new_cr3(vcpu, cr3, false); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7748037b17fd..493afbf12e78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -847,16 +847,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + bool skip_tlb_flush = false; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - if (pcid_enabled) + if (pcid_enabled) { + skip_tlb_flush = cr3 & CR3_PCID_INVD; cr3 &= ~CR3_PCID_INVD; + } #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + if (!skip_tlb_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -867,7 +872,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3); + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -- cgit v1.2.3 From 956bf3531fba53c0501eda4fbc67950b0f7b913f Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:18 -0700 Subject: kvm: x86: Skip shadow page resync on CR3 switch when indicated by guest When the guest indicates that the TLB doesn't need to be flushed in a CR3 switch, we can also skip resyncing the shadow page tables since an out-of-sync shadow page table is equivalent to an out-of-sync TLB. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++--- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 3 files changed, 34 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0f6965ce016a..9446a36a4ab7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4098,9 +4098,19 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - if (!skip_tlb_flush) + if (!skip_tlb_flush) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_x86_ops->tlb_flush(vcpu, true); + } + + /* + * The last MMIO access's GVA and GPA are cached in the + * VCPU. When switching to a new CR3, that GVA->GPA + * mapping may no longer be valid. So clear any cached + * MMIO info even when we don't need to sync the shadow + * page tables. 
+ */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -5217,6 +5227,21 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) struct kvm_mmu *mmu = &vcpu->arch.mmu; mmu->invlpg(vcpu, gva, mmu->root_hpa); + + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether the prev_root mapping of the VA is + * marked global, or to just sync it blindly, so we might as well just + * always sync it. + * + * Mappings not reachable via the current cr3 or the prev_root.cr3 will + * be synced when switching to that cr3, so nothing needs to be done + * here for them. + */ + if (VALID_PAGE(mmu->prev_root.hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } @@ -5232,8 +5257,10 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (VALID_PAGE(mmu->prev_root.hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) + pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } ++vcpu->stat.invlpg; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6151418cec32..b81210051133 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8821,7 +8821,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) == operand.pcid) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS); /* * If neither the current cr3 nor the prev_root.cr3 use the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 493afbf12e78..aa5d96b4b386 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -858,10 +858,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - - if (!skip_tlb_flush) + if (!skip_tlb_flush) { + kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } return 0; } -- cgit v1.2.3 From faff87588d8bfd9e56e9203412f0bb80455da7b9 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 29 Jun 2018 13:10:05 -0700 Subject: kvm: x86: Flush only affected TLB entries in kvm_mmu_invlpg* This needs a minor bug fix. The updated patch is as follows. Thanks, Junaid ------------------------------------------------------------------------------ kvm_mmu_invlpg() and kvm_mmu_invpcid_gva() only need to flush the TLB entries for the specific guest virtual address, instead of flushing all TLB entries associated with the VM. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++++++++ arch/x86/kvm/mmu.c | 14 +++++++++++--- arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 262b0bc64dfc..dcdc9150bb76 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -985,6 +985,14 @@ struct kvm_x86_ops { void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + /* + * Flush any TLB entries associated with the given GVA. + * Does not need to flush GPA->HPA mappings. + * Can potentially get non-canonical addresses through INVLPGs, which + * the implementation may choose to ignore if appropriate. 
+ */ + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9446a36a4ab7..f84c194e6db1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5226,6 +5226,10 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + mmu->invlpg(vcpu, gva, mmu->root_hpa); /* @@ -5242,7 +5246,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) if (VALID_PAGE(mmu->prev_root.hpa)) mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5250,18 +5254,22 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool tlb_flush = false; if (pcid == kvm_get_active_pcid(vcpu)) { mmu->invlpg(vcpu, gva, mmu->root_hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + tlb_flush = true; } if (VALID_PAGE(mmu->prev_root.hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + tlb_flush = true; } + if (tlb_flush) + kvm_x86_ops->tlb_flush_gva(vcpu, gva); + ++vcpu->stat.invlpg; /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be9931cad409..73e27a98456f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5434,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) svm->asid_generation--; } +static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + invlpga(gva, svm->vmcb->control.asid); +} + static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } @@ -7086,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .tlb_flush_gva = svm_flush_tlb_gva, .run = svm_vcpu_run, .handle_exit = handle_exit, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b81210051133..5aea5af02386 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1992,6 +1992,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) @@ -4833,6 +4846,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. 
+ */ +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -13603,6 +13630,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, -- cgit v1.2.3 From b94742c958f0b97d304d4aecb4603a20ee9a2df3 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:20 -0700 Subject: kvm: x86: Add multi-entry LRU cache for previous CR3s Adds support for storing multiple previous CR3/root_hpa pairs maintained as an LRU cache, so that the lockless CR3 switch path can be used when switching back to any of them. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 10 ++-- arch/x86/kvm/mmu.c | 111 ++++++++++++++++++++++++++++------------ arch/x86/kvm/vmx.c | 12 +++-- 3 files changed, 92 insertions(+), 41 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dcdc9150bb76..4f1983640bda 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -335,6 +335,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_ROOT_INFO_INVALID \ ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) +#define KVM_MMU_NUM_PREV_ROOTS 3 + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -363,7 +365,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; - struct kvm_mmu_root_info prev_root; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault @@ -1295,9 +1297,9 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -#define KVM_MMU_ROOT_CURRENT BIT(0) -#define KVM_MMU_ROOT_PREVIOUS BIT(1) -#define KVM_MMU_ROOTS_ALL (~0UL) +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (~0UL) int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f84c194e6db1..687bfb03b9ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3445,18 +3445,26 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; - bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS; + + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. 
*/ - if (!(free_active_root && VALID_PAGE(mmu->root_hpa)) && - !(free_prev_root && VALID_PAGE(mmu->prev_root.hpa))) - return; + if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && + VALID_PAGE(mmu->prev_roots[i].hpa)) + break; + + if (i == KVM_MMU_NUM_PREV_ROOTS) + return; + } spin_lock(&vcpu->kvm->mmu_lock); - if (free_prev_root) - mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, - &invalid_list); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) + mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && @@ -4064,6 +4072,38 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } +/* + * Find out if a previously cached root matching the new CR3/role is available. + * The current root is also inserted into the cache. + * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is + * returned. + * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and + * false is returned. This root should now be freed by the caller. + */ +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) +{ + uint i; + struct kvm_mmu_root_info root; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + root.cr3 = mmu->get_cr3(vcpu); + root.hpa = mmu->root_hpa; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + swap(root, mmu->prev_roots[i]); + + if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && + page_header(root.hpa) != NULL && + new_role.word == page_header(root.hpa)->role.word) + break; + } + + mmu->root_hpa = root.hpa; + + return i < KVM_MMU_NUM_PREV_ROOTS; +} + static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, union kvm_mmu_page_role new_role, bool skip_tlb_flush) @@ -4077,18 +4117,10 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) { - gpa_t prev_cr3 = mmu->prev_root.cr3; - if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) return false; - swap(mmu->root_hpa, mmu->prev_root.hpa); - mmu->prev_root.cr3 = mmu->get_cr3(vcpu); - - if (new_cr3 == prev_cr3 && - VALID_PAGE(mmu->root_hpa) && - page_header(mmu->root_hpa) != NULL && - new_role.word == page_header(mmu->root_hpa)->role.word) { + if (cached_root_available(vcpu, new_cr3, new_role)) { /* * It is possible that the cached previous root page is * obsolete because of a change in the MMU @@ -4854,8 +4886,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { if (reset_roots) { + uint i; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; } if (mmu_is_nested(vcpu)) @@ -5225,6 +5261,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + int i; /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_address(gva, vcpu)) @@ -5235,16 +5272,17 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) /* * INVLPG is required to invalidate any global mappings for the VA, * irrespective of PCID. 
Since it would take us roughly similar amount - * of work to determine whether the prev_root mapping of the VA is - * marked global, or to just sync it blindly, so we might as well just - * always sync it. + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. * - * Mappings not reachable via the current cr3 or the prev_root.cr3 will - * be synced when switching to that cr3, so nothing needs to be done - * here for them. + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. */ - if (VALID_PAGE(mmu->prev_root.hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; @@ -5255,16 +5293,19 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = &vcpu->arch.mmu; bool tlb_flush = false; + uint i; if (pcid == kvm_get_active_pcid(vcpu)) { mmu->invlpg(vcpu, gva, mmu->root_hpa); tlb_flush = true; } - if (VALID_PAGE(mmu->prev_root.hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { - mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - tlb_flush = true; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (VALID_PAGE(mmu->prev_roots[i].hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + tlb_flush = true; + } } if (tlb_flush) @@ -5273,9 +5314,9 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) ++vcpu->stat.invlpg; /* - * Mappings not reachable via the current cr3 or the prev_root.cr3 will - * be synced when switching to that cr3, so nothing needs to be done - * here for them. + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. 
*/ } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); @@ -5321,12 +5362,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { + uint i; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + return alloc_mmu_pages(vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5aea5af02386..97e4d71431a1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8788,6 +8788,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) bool pcid_enabled; gva_t gva; struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; @@ -8846,12 +8848,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) - == operand.pcid) - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + kvm_mmu_free_roots(vcpu, roots_to_free); /* - * If neither the current cr3 nor the prev_root.cr3 use the + * If neither the current cr3 nor any of the prev_roots use the * given PCID, then nothing needs to be done here because a * resync will happen anyway before switching to any other CR3. */ -- cgit v1.2.3 From 877ad952be3d51445f6a74dd63708a9327c8f19d Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:23 +0000 Subject: KVM: vmx: Add tlb_remote_flush callback support Register tlb_remote_flush callback for vmx when hyperv capability of nested guest mapping flush is detected. The interface can help to reduce overhead when flush ept table among vcpus for nested VM. The tradition way is to send IPIs to all affected vcpus and executes INVEPT on each vcpus. It will trigger several vmexits for IPI and INVEPT emulation. Hyper-V provides such hypercall to do flush for all vcpus and call the hypercall when all ept table pointers of single VM are same. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 97e4d71431a1..6a95e6b1a43d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -188,12 +188,21 @@ module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; }; #define NR_AUTOLOAD_MSRS 8 @@ -863,6 +872,7 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; }; enum segment_cache_field { @@ -1357,6 +1367,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) * GUEST_IA32_RTIT_CTL = 0x00002814, */ } + +/* check_ept_pointer() should be under protection of ept_pointer_lock. 
*/ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + int ret; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + ret = -ENOTSUPP; + goto out; + } + + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + +out: + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -5041,6 +5093,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; u64 eptp; @@ -5048,11 +5101,20 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + if (enable_unrestricted_guest || is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -7622,6 +7684,12 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; @@ -10665,6 +10733,8 @@ free_vcpu: static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; return 0; -- cgit v1.2.3 From 36090bf43a6b835a42f515cb515ff6fa293a25fe Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 27 Jul 2018 09:18:50 -0700 Subject: kvm: nVMX: Fix fault vector for VMX operation at CPL > 0 The fault that should be raised for a privilege level violation is #GP rather than #UD. Fixes: 727ba748e110b4 ("kvm: nVMX: Enforce cpl=0 for VMX instructions") Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a95e6b1a43d..d19bb602ff07 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8096,7 +8096,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* CPL=0 must be checked manually. 
*/ if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 1; } @@ -8160,7 +8160,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 0; } -- cgit v1.2.3 From e49fcb8b9ef26dfb2d02b173d790e1ef41177121 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 27 Jul 2018 13:44:45 -0700 Subject: kvm: nVMX: Fix fault priority for VMX operations When checking emulated VMX instructions for faults, the #UD for "IF (not in VMX operation)" should take precedence over the #GP for "ELSIF CPL > 0." Suggested-by: Eric Northup Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d19bb602ff07..ccc9d75124a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8159,15 +8159,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); + if (!to_vmx(vcpu)->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); return 0; } + return 1; } -- cgit v1.2.3 From e368b875a8a91125f1bd0ffac1100c305c0bdff4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:41 -0700 Subject: KVM: vmx: refactor segmentation code in vmx_save_host_state() Use local variables in vmx_save_host_state() to temporarily track the selector and base values for FS and GS, and reorganize the code so that the 64-bit vs 32-bit portions are contained within a single #ifdef. This refactoring paves the way for future patches to modify the updating of VMCS state with minimal changes to the code, and (hopefully) simplifies resolving a likely conflict with another in-flight patch[1] by being the whipping boy for future patches. 
[1] https://www.spinics.net/lists/kvm/msg171647.html Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 53 ++++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ccc9d75124a6..06fd598ee8a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2662,8 +2662,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); - unsigned long fs_base, kernel_gs_base; #endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; int i; if (vmx->host_state.loaded) @@ -2678,49 +2679,51 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - kernel_gs_base = current->thread.gsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { -#endif - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); -#ifdef CONFIG_X86_64 + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } + + if (is_long_mode(&vmx->vcpu)) + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); #endif - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + + vmx->host_state.fs_sel = fs_sel; + if (!(fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, fs_sel); vmx->host_state.fs_reload_needed = 0; } else { vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + vmx->host_state.gs_sel = gs_sel; + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); else { vmcs_write16(HOST_GS_SELECTOR, 0); vmx->host_state.gs_ldt_reload_needed = 1; } -#ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); + vmcs_writel(HOST_GS_BASE, gs_base); - vmx->msr_host_kernel_gs_base = kernel_gs_base; - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, -- cgit v1.2.3 From bd9966de4e14fb559e89a06f7f5c9aab2cc028b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:42 -0700 Subject: KVM: vmx: track host_state.loaded using a loaded_vmcs pointer Using 'struct loaded_vmcs*' to track whether the CPU registers contain host or guest state kills two birds with one stone. 1. 
The (effective) boolean host_state.loaded is poorly named. It does not track whether or not host state is loaded into the CPU registers (which most readers would expect), but rather tracks if host state has been saved AND guest state is loaded. 2. Using a loaded_vmcs pointer provides a more robust framework for the optimized guest/host state switching, especially when consideration per-VMCS enhancements. To that end, WARN_ONCE if we try to switch to host state with a different VMCS than was last used to save host state. Resolve an occurrence of the new WARN by setting loaded_vmcs after the call to vmx_vcpu_put() in vmx_switch_vmcs(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06fd598ee8a6..935b84f40f65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -804,18 +804,22 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { unsigned nr; struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; } msr_autoload; + struct { - int loaded; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; @@ -2667,10 +2671,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - if (vmx->host_state.loaded) + if (vmx->loaded_cpu_state) return; - vmx->host_state.loaded = 1; + vmx->loaded_cpu_state = vmx->loaded_vmcs; + /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. @@ -2732,11 +2737,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) + if (!vmx->loaded_cpu_state) return; + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; + vmx->loaded_cpu_state = NULL; + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -10596,8 +10604,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); } -- cgit v1.2.3 From 678e315e78a780dbef384b92339c8414309dbc11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:43 -0700 Subject: KVM: vmx: add dedicated utility to access guest's kernel_gs_base When lazy save/restore of MSR_KERNEL_GS_BASE was introduced[1], the MSR was intercepted in all modes and was only restored for the host when the guest is in 64-bit mode. So at the time, going through the full host restore prior to accessing MSR_KERNEL_GS_BASE was necessary to load host state and was not a significant waste of cycles. 
Later, MSR_KERNEL_GS_BASE interception was disabled for a 64-bit guest[2], and then unconditionally saved/restored for the host[3]. As a result, loading full host state is overkill for accesses to MSR_KERNEL_GS_BASE, and completely unnecessary when the guest is not in 64-bit mode. Add a dedicated utility to read/write the guest's MSR_KERNEL_GS_BASE (outside of the save/restore flow) to minimize the overhead incurred when accessing the MSR. When setting EFER, only decache the MSR if the new EFER will disable long mode. Removing out-of-band usage of vmx_load_host_state() also eliminates, or at least reduces, potential corner cases in its usage, which in turn will (hopefuly) make it easier to reason about future changes to the save/restore flow, e.g. optimization of saving host state. [1] commit 44ea2b1758d8 ("KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area") [2] commit 5897297bc228 ("KVM: VMX: Don't intercept MSR_KERNEL_GS_BASE") [3] commit c8770e7ba63b ("KVM: VMX: Fix host userspace gsbase corruption") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 935b84f40f65..72c392525f61 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2772,13 +2772,31 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) load_fixmap_gdt(raw_smp_processor_id()); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, + vmx->msr_guest_kernel_gs_base); + preempt_enable(); + } + return vmx->msr_guest_kernel_gs_base; } +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + } + vmx->msr_guest_kernel_gs_base = data; +} +#endif + static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -3857,8 +3875,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - msr_info->data = vmx->msr_guest_kernel_gs_base; + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: @@ -3958,8 +3975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: @@ -4849,10 +4865,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return; /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in + * 64-bit mode as a 64-bit kernel may frequently access the + * MSR. This means we need to manually save/restore the MSR + * when switching between guest and host state, but only if + * the guest is in 64-bit mode. 
Sync our cached value if the + * guest is transitioning to 32-bit mode and the CPU contains + * guest state, i.e. the cache is stale. */ - vmx_load_host_state(to_vmx(vcpu)); +#ifdef CONFIG_X86_64 + if (!(efer & EFER_LMA)) + (void)vmx_read_guest_kernel_gs_base(vmx); +#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); -- cgit v1.2.3 From 6d6095bd2c915f4c2ba6f25e7d16e38b06c904de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:44 -0700 Subject: KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state() Now that the vmx_load_host_state() wrapper is gone, i.e. the only time we call the core functions is when we're actually about to switch between guest/host, rename the functions that handle lazy state switching to vmx_prepare_switch_to_{guest,host}_state() to better document the full extent of their functionality. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 72c392525f61..ab27f9734ac0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2661,7 +2661,7 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 @@ -2735,7 +2735,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->guest_msrs[i].mask); } -static void __vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { if (!vmx->loaded_cpu_state) return; @@ -2938,7 +2938,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); - __vmx_load_host_state(to_vmx(vcpu)); + vmx_prepare_switch_to_host(to_vmx(vcpu)); } static bool emulation_required(struct kvm_vcpu *vcpu) @@ -6099,8 +6099,8 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in - * __vmx_load_host_state(), in case userspace uses the null selectors - * too (the expected case). + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); @@ -10565,9 +10565,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * - * We can't defer this to vmx_load_host_state() since that function - * may be executed in interrupt context, which saves and restore segments - * around it, nullifying its effect. + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); @@ -11668,7 +11668,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set host-state according to L0's settings (vmcs12 is irrelevant here) * Some constant fields are set here by vmx_set_constant_host_state(). * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() + * is called. 
*/ vmx_set_constant_host_state(vmx); @@ -13707,7 +13708,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, -- cgit v1.2.3 From fd1ec7723fbd560f924769b43cbdfe82dfd6a98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:45 -0700 Subject: KVM: nVMX: remove a misleading comment regarding vmcs02 fields prepare_vmcs02() has an odd comment that says certain fields are "not in vmcs02". AFAICT the intent of the comment is to document that various VMCS fields are not handled by prepare_vmcs02(), e.g. HOST_{FS,GS}_{BASE,SELECTOR}. While technically true, the comment is misleading, e.g. it can lead the reader to think that KVM never writes those fields to vmcs02. Remove the comment altogether as the handling of FS and GS is not specific to nested VMX, and GUEST_PML_INDEX has been written by prepare_vmcs02() since commit "4e59516a12a6 (kvm: vmx: ensure VMCS is current while enabling PML)" Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ab27f9734ac0..a135c91d44f8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11741,11 +11741,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - /* - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, - * HOST_FS_BASE, HOST_GS_BASE. - */ - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); -- cgit v1.2.3 From e920de8507c6c8760c775cd718627e7cbf57c3b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:46 -0700 Subject: KVM: vmx: compute need to reload FS/GS/LDT on demand Remove fs_reload_needed and gs_ldt_reload_needed from host_state and instead compute whether we need to reload various state at the time we actually do the reload. The state that is tracked by the *_reload_needed variables is not any more volatile than the trackers themselves. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a135c91d44f8..c8a583ff7bf2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -824,8 +824,6 @@ struct vcpu_vmx { #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; #endif - int gs_ldt_reload_needed; - int fs_reload_needed; } host_state; struct { int vm86_active; @@ -2681,7 +2679,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * allow segment selectors with cpl > 0 or ti == 1. 
*/ vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); @@ -2711,20 +2708,15 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx->host_state.fs_sel = fs_sel; - if (!(fs_sel & 7)) { + if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { + else vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } vmx->host_state.gs_sel = gs_sel; if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else { + else vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; - } vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, gs_base); @@ -2749,7 +2741,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { + if (vmx->host_state.ldt_sel || (vmx->host_state.gs_sel & 7)) { kvm_load_ldt(vmx->host_state.ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(vmx->host_state.gs_sel); @@ -2757,7 +2749,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) loadsegment(gs, vmx->host_state.gs_sel); #endif } - if (vmx->host_state.fs_reload_needed) + if (vmx->host_state.fs_sel & 7) loadsegment(fs, vmx->host_state.fs_sel); #ifdef CONFIG_X86_64 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { -- cgit v1.2.3 From d7ee039e2bab6341cdabf42d3036063cf91b40ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:47 -0700 Subject: KVM: vmx: move struct host_state usage to struct loaded_vmcs Make host_state a property of a loaded_vmcs so that it can be used as a cache of the VMCS fields, e.g. to lazily VMWRITE the corresponding VMCS field. Treating host_state as a cache does not work if it's not VMCS specific as the cache would become incoherent when switching between vmcs01 and vmcs02. Move vmcs_host_cr3 and vmcs_host_cr4 into host_state. Explicitly zero out host_state when allocating a new VMCS for a loaded_vmcs. Unlike the pre-existing vmcs_host_cr{3,4} usage, the segment information is not guaranteed to be (re)initialized when running a new nested VMCS, e.g. HOST_FS_BASE is not written in vmx_set_constant_host_state(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 72 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 28 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c8a583ff7bf2..d0d2374a3170 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -218,6 +218,21 @@ struct vmcs { char data[0]; }; +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + /* * Track a VMCS that may be loaded on a certain CPU. 
If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs @@ -229,14 +244,13 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; }; struct shared_msr_entry { @@ -819,12 +833,6 @@ struct vcpu_vmx { struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; } msr_autoload; - struct { - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - } host_state; struct { int vm86_active; ulong save_rflags; @@ -2662,6 +2670,7 @@ static unsigned long segment_base(u16 selector) static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif @@ -2673,16 +2682,17 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) return; vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = kvm_read_ldt(); + host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { @@ -2707,12 +2717,12 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx->host_state.fs_sel = fs_sel; + host_state->fs_sel = fs_sel; if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); else vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.gs_sel = gs_sel; + host_state->gs_sel = gs_sel; if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); else @@ -2729,10 +2739,13 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { + struct vmcs_host_state *host_state; + if (!vmx->loaded_cpu_state) return; WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; ++vmx->vcpu.stat.host_state_reload; vmx->loaded_cpu_state = NULL; @@ -2741,20 +2754,20 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.ldt_sel || (vmx->host_state.gs_sel & 7)) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); + load_gs_index(host_state->gs_sel); #else - loadsegment(gs, vmx->host_state.gs_sel); + loadsegment(gs, host_state->gs_sel); #endif } - if (vmx->host_state.fs_sel & 7) - loadsegment(fs, vmx->host_state.fs_sel); + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { - loadsegment(ds, vmx->host_state.ds_sel); - loadsegment(es, vmx->host_state.es_sel); + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + 
loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); @@ -4574,6 +4587,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) evmcs->hv_enlightenments_control.msr_bitmap = 1; } } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + return 0; out_vmcs: @@ -6080,12 +6096,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -10356,15 +10372,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the -- cgit v1.2.3 From f3bbc0dcedf50d29d2d6fdfb16c5ec3e2368837f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:48 -0700 Subject: KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup The HOST_{FS,GS}_BASE fields are guaranteed to be written prior to VMENTER, by way of vmx_prepare_switch_to_guest(). Initialize the fields to zero for 64-bit kernels instead of pulling the base values from their respective MSRs. In addition to eliminating two RDMSRs, vmx_prepare_switch_to_guest() can safely assume the initial value of the fields is zero in all cases. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d0d2374a3170..60bf9a95219f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6361,9 +6361,6 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { -#ifdef CONFIG_X86_64 - unsigned long a; -#endif int i; if (enable_shadow_vmcs) { @@ -6418,15 +6415,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); -- cgit v1.2.3 From 8f21a0bbf36f58334695ae6e46c0cf906a217154 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:49 -0700 Subject: KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible On a 64-bit host, FS.sel and GS.sel are all but guaranteed to be 0, which in turn means they'll rarely change. 
Skip the VMWRITE for the associated VMCS fields when loading host state if the selector hasn't changed since the last VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 60bf9a95219f..4937e11f971a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2717,16 +2717,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - host_state->fs_sel = fs_sel; - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, gs_base); -- cgit v1.2.3 From 5e079c7ece1060491ef4a45414823162cce91b3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:50 -0700 Subject: KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible The host's FS.base and GS.base rarely change, e.g. ~0.1% of host/guest swaps on my system. Cache the last value written to the VMCS and skip the VMWRITE to the associated VMCS fields when loading host state if the value hasn't changed since the last VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4937e11f971a..b23ecc04ca51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -226,6 +226,8 @@ struct vmcs { struct vmcs_host_state { unsigned long cr3; /* May not match real cr3 */ unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 @@ -2731,9 +2733,14 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmcs_write16(HOST_GS_SELECTOR, 0); host_state->gs_sel = gs_sel; } - - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, gs_base); + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, -- cgit v1.2.3 From fd8ca6dac9b45db8503cf508880edd63e039e2f2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 6 Aug 2018 16:42:49 +0200 Subject: KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in arch/x86/kvm/vmx.c. Signed-off-by: Uros Bizjak [Mark error paths as unlikely while touching this. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86/kvm/vmx.c') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b23ecc04ca51..16f9373c01de 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -38,6 +38,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1916,11 +1917,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; + bool error; - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) + : "memory"); + BUG_ON(error); } static inline void __invept(int ext, u64 eptp, gpa_t gpa) @@ -1928,11 +1930,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; + bool error; - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) + : "memory"); + BUG_ON(error); } static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) @@ -1948,12 +1951,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } @@ -1970,15 +1973,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } @@ -2203,10 +2206,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) + : CC_OUT(na) (error) : "a"(value), "d"(field)); if (unlikely(error)) vmwrite_error(field, value); } -- cgit v1.2.3
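
A note on the selector tests used throughout the host-state patches above: the low three bits of a segment selector are the RPL (bits 1:0) and the TI/LDT flag (bit 2), so "sel & 7" is non-zero exactly when the selector is not a plain RPL-0 GDT selector, which is both the case the VMCS HOST_{FS,GS}_SELECTOR fields cannot carry and the case that needs an explicit reload on the way back to the host. A minimal userspace sketch of that test; the selector values are illustrative, not taken from the series:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A selector needs special handling if it has a non-zero RPL (bits 1:0)
 * or references the LDT (TI, bit 2); a plain RPL-0 GDT selector has all
 * three bits clear, which is the common case on a 64-bit host.
 */
static bool sel_needs_reload(uint16_t sel)
{
	return (sel & 7) != 0;
}

int main(void)
{
	uint16_t gdt_rpl0 = 0x18;	/* GDT index 3, RPL 0: nothing to do   */
	uint16_t gdt_rpl3 = 0x3b;	/* GDT index 7, RPL 3: reload required */
	uint16_t ldt_sel  = 0x0f;	/* LDT index 1, RPL 3: reload required */

	printf("%#x -> %d\n", (unsigned)gdt_rpl0, sel_needs_reload(gdt_rpl0));
	printf("%#x -> %d\n", (unsigned)gdt_rpl3, sel_needs_reload(gdt_rpl3));
	printf("%#x -> %d\n", (unsigned)ldt_sel,  sel_needs_reload(ldt_sel));
	return 0;
}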
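
The last few host-state patches converge on a simple write-through cache: remember what was last VMWRITTEN to a HOST_* field and skip the VMWRITE when the incoming value matches. The shape of that pattern, lifted out of KVM into a standalone sketch; vmcs_writel_stub() is a made-up stand-in for the real VMWRITE wrapper, the base values are arbitrary, and a 64-bit build is assumed:

#include <stdio.h>

/*
 * Write-through cache of two VMCS host fields, modelled on
 * loaded_vmcs->host_state: the cached value tracks whatever was last
 * "VMWRITTEN", so a matching value means the VMWRITE can be skipped.
 * Starting from zero mirrors alloc_loaded_vmcs() zeroing host_state,
 * which only works because the VMCS fields themselves start out zero.
 */
struct host_base_cache {
	unsigned long fs_base;
	unsigned long gs_base;
};

static unsigned int vmwrites;

/* Stand-in for the real vmcs_writel(); just counts emulated VMWRITEs. */
static void vmcs_writel_stub(const char *field, unsigned long value)
{
	vmwrites++;
	printf("VMWRITE %-12s = %#lx\n", field, value);
}

static void prepare_switch_to_guest(struct host_base_cache *c,
				    unsigned long fs_base,
				    unsigned long gs_base)
{
	if (fs_base != c->fs_base) {
		vmcs_writel_stub("HOST_FS_BASE", fs_base);
		c->fs_base = fs_base;
	}
	if (gs_base != c->gs_base) {
		vmcs_writel_stub("HOST_GS_BASE", gs_base);
		c->gs_base = gs_base;
	}
}

int main(void)
{
	struct host_base_cache cache = { 0, 0 };

	prepare_switch_to_guest(&cache, 0x7f0000001000UL, 0xffff888000000000UL);
	prepare_switch_to_guest(&cache, 0x7f0000001000UL, 0xffff888000000000UL);
	printf("VMWRITEs issued: %u (second call skipped both)\n", vmwrites);
	return 0;
}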
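
CC_SET() and CC_OUT() come from asm/asm.h, the header the final patch adds an include for. On compilers that advertise __GCC_ASM_FLAG_OUTPUTS__ they turn the condition code into an asm flag output ("=@cc<cond>"), and otherwise they fall back to an explicit set<cond> instruction, so the open-coded "; setna %0" and "ja 1f ; ud2" sequences above become a bool output plus an explicit check. A small standalone illustration of the flag-output form, using cmp rather than a VMX instruction and assuming an x86-64 build with gcc 6+ or a recent clang:

#include <stdbool.h>
#include <stdio.h>

/*
 * "na" is "not above", i.e. CF=1 or ZF=1 after the instruction, which is
 * the same condition the VMX helpers test to detect VMfail.  With a flag
 * output operand the "cc" clobber is unnecessary, matching the patch.
 */
static bool cmp_not_above(unsigned long a, unsigned long b)
{
	bool error;

	/* AT&T syntax: "cmp %2, %1" sets flags from a - b. */
	asm volatile ("cmp %2, %1"
		      : "=@ccna" (error)
		      : "r" (a), "r" (b));
	return error;
}

int main(void)
{
	printf("3 vs 5 -> not above: %d\n", cmp_not_above(3, 5));	/* 1 */
	printf("7 vs 5 -> not above: %d\n", cmp_not_above(7, 5));	/* 0 */
	return 0;
}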