diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-22 00:31:43 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-22 00:31:43 +0300 |
commit | 3e10585335b7967326ca7b4118cada0d2d00a2ab (patch) | |
tree | e1655bc4f093f7de3a54dc3b2d83a54159aca10b | |
parent | 9c5b80b795e9c847a7b7f5e63c6bcf07873fbcdf (diff) | |
parent | 8c6e67bec3192f16fa624203c8131e10cc4814ba (diff) | |
download | linux-3e10585335b7967326ca7b4118cada0d2d00a2ab.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
"x86:
- Support for userspace to emulate Xen hypercalls
- Raise the maximum number of user memslots
- Scalability improvements for the new MMU.
Instead of the complex "fast page fault" logic that is used in
mmu.c, tdp_mmu.c uses an rwlock so that page faults are concurrent,
but the code that can run against page faults is limited. Right now
only page faults take the lock for reading; in the future this will
be extended to some cases of page table destruction. I hope to
switch the default MMU around 5.12-rc3 (some testing was delayed
due to Chinese New Year).
- Cleanups for MAXPHYADDR checks
- Use static calls for vendor-specific callbacks
- On AMD, use VMLOAD/VMSAVE to save and restore host state
- Stop using deprecated jump label APIs
- Workaround for AMD erratum that made nested virtualization
unreliable
- Support for LBR emulation in the guest
- Support for communicating bus lock vmexits to userspace
- Add support for SEV attestation command
- Miscellaneous cleanups
PPC:
- Support for second data watchpoint on POWER10
- Remove some complex workarounds for buggy early versions of POWER9
- Guest entry/exit fixes
ARM64:
- Make the nVHE EL2 object relocatable
- Cleanups for concurrent translation faults hitting the same page
- Support for the standard TRNG hypervisor call
- A bunch of small PMU/Debug fixes
- Simplification of the early init hypercall handling
Non-KVM changes (with acks):
- Detection of contended rwlocks (implemented only for qrwlocks,
because KVM only needs it for x86)
- Allow __DISABLE_EXPORTS from assembly code
- Provide a saner follow_pfn replacements for modules"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (192 commits)
KVM: x86/xen: Explicitly pad struct compat_vcpu_info to 64 bytes
KVM: selftests: Don't bother mapping GVA for Xen shinfo test
KVM: selftests: Fix hex vs. decimal snafu in Xen test
KVM: selftests: Fix size of memslots created by Xen tests
KVM: selftests: Ignore recently added Xen tests' build output
KVM: selftests: Add missing header file needed by xAPIC IPI tests
KVM: selftests: Add operand to vmsave/vmload/vmrun in svm.c
KVM: SVM: Make symbol 'svm_gp_erratum_intercept' static
locking/arch: Move qrwlock.h include after qspinlock.h
KVM: PPC: Book3S HV: Fix host radix SLB optimisation with hash guests
KVM: PPC: Book3S HV: Ensure radix guest has no SLB entries
KVM: PPC: Don't always report hash MMU capability for P9 < DD2.2
KVM: PPC: Book3S HV: Save and restore FSCR in the P9 path
KVM: PPC: remove unneeded semicolon
KVM: PPC: Book3S HV: Use POWER9 SLBIA IH=6 variant to clear SLB
KVM: PPC: Book3S HV: No need to clear radix host SLB before loading HPT guest
KVM: PPC: Book3S HV: Fix radix guest SLB side channel
KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
KVM: PPC: Book3S HV: Introduce new capability for 2nd DAWR
KVM: PPC: Book3S HV: Add infrastructure to support 2nd DAWR
...
150 files changed, 6655 insertions, 2060 deletions
diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index 09a8f2a34e39..469a6308765b 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -263,6 +263,27 @@ Returns: 0 on success, -negative on error __u32 trans_len; }; +10. KVM_SEV_GET_ATTESTATION_REPORT +---------------------------------- + +The KVM_SEV_GET_ATTESTATION_REPORT command can be used by the hypervisor to query the attestation +report containing the SHA-256 digest of the guest memory and VMSA passed through the KVM_SEV_LAUNCH +commands and signed with the PEK. The digest returned by the command should match the digest +used by the guest owner with the KVM_SEV_LAUNCH_MEASURE. + +Parameters (in): struct kvm_sev_attestation + +Returns: 0 on success, -negative on error + +:: + + struct kvm_sev_attestation_report { + __u8 mnonce[16]; /* A random mnonce that will be placed in the report */ + + __u64 uaddr; /* userspace address where the report should be copied */ + __u32 len; + }; + References ========== diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 99ceb978c8b0..45fd862ac128 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -960,6 +960,14 @@ memory. __u8 pad2[30]; }; +If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the +KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl. +This requests KVM to generate the contents of the hypercall page +automatically; hypercalls will be intercepted and passed to userspace +through KVM_EXIT_XEN. In this case, all of the blob size and address +fields must be zero. + +No other flags are currently valid in the struct kvm_xen_hvm_config. 4.29 KVM_GET_CLOCK ------------------ @@ -2268,6 +2276,8 @@ registers, find a list below: PPC KVM_REG_PPC_PSSCR 64 PPC KVM_REG_PPC_DEC_EXPIRY 64 PPC KVM_REG_PPC_PTCR 64 + PPC KVM_REG_PPC_DAWR1 64 + PPC KVM_REG_PPC_DAWRX1 64 PPC KVM_REG_PPC_TM_GPR0 64 ... PPC KVM_REG_PPC_TM_GPR31 64 @@ -4831,6 +4841,101 @@ into user space. If a vCPU is in running state while this ioctl is invoked, the vCPU may experience inconsistent filtering behavior on MSR accesses. +4.127 KVM_XEN_HVM_SET_ATTR +-------------------------- + +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_xen_hvm_attr +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + struct { + __u64 gfn; + } shared_info; + __u64 pad[4]; + } u; + }; + +type values: + +KVM_XEN_ATTR_TYPE_LONG_MODE + Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This + determines the layout of the shared info pages exposed to the VM. + +KVM_XEN_ATTR_TYPE_SHARED_INFO + Sets the guest physical frame number at which the Xen "shared info" + page resides. Note that although Xen places vcpu_info for the first + 32 vCPUs in the shared_info page, KVM does not automatically do so + and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used + explicitly even when the vcpu_info for a given vCPU resides at the + "default" location in the shared_info page. This is because KVM is + not aware of the Xen CPU id which is used as the index into the + vcpu_info[] array, so cannot know the correct default location. + +KVM_XEN_ATTR_TYPE_UPCALL_VECTOR + Sets the exception vector used to deliver Xen event channel upcalls. + +4.128 KVM_XEN_HVM_GET_ATTR +-------------------------- + +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_xen_hvm_attr +:Returns: 0 on success, < 0 on error + +Allows Xen VM attributes to be read. For the structure and types, +see KVM_XEN_HVM_SET_ATTR above. + +4.129 KVM_XEN_VCPU_SET_ATTR +--------------------------- + +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xen_vcpu_attr +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; + __u64 pad[4]; + } u; + }; + +type values: + +KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO + Sets the guest physical address of the vcpu_info for a given vCPU. + +KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO + Sets the guest physical address of an additional pvclock structure + for a given vCPU. This is typically used for guest vsyscall support. + +4.130 KVM_XEN_VCPU_GET_ATTR +--------------------------- + +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xen_vcpu_attr +:Returns: 0 on success, < 0 on error + +Allows Xen vCPU attributes to be read. For the structure and types, +see KVM_XEN_VCPU_SET_ATTR above. 5. The kvm_run structure ======================== @@ -4893,9 +4998,11 @@ local APIC is not used. __u16 flags; More architecture-specific flags detailing state of the VCPU that may -affect the device's behavior. The only currently defined flag is -KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the -VCPU is in system management mode. +affect the device's behavior. Current defined flags: + /* x86, set if the VCPU is in system management mode */ + #define KVM_RUN_X86_SMM (1 << 0) + /* x86, set if bus lock detected in VM */ + #define KVM_RUN_BUS_LOCK (1 << 1) :: @@ -4996,13 +5103,18 @@ to the byte array. .. note:: - For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, + For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish - incomplete operations and then check for pending signals. Userspace - can re-enter the guest with an unmasked signal pending to complete - pending operations. + incomplete operations and then check for pending signals. + + The pending state of the operation is not preserved in state which is + visible to userspace, thus userspace should ensure that the operation is + completed before performing a live migration. Userspace can re-enter the + guest with an unmasked signal pending or with the immediate_exit field set + to complete pending operations without allowing any further instructions + to be executed. :: @@ -5329,6 +5441,34 @@ vCPU execution. If the MSR write was unsuccessful, user space also sets the :: + + struct kvm_xen_exit { + #define KVM_EXIT_XEN_HCALL 1 + __u32 type; + union { + struct { + __u32 longmode; + __u32 cpl; + __u64 input; + __u64 result; + __u64 params[6]; + } hcall; + } u; + }; + /* KVM_EXIT_XEN */ + struct kvm_hyperv_exit xen; + +Indicates that the VCPU exits into userspace to process some tasks +related to Xen emulation. + +Valid values for 'type' are: + + - KVM_EXIT_XEN_HCALL -- synchronously notify user-space about Xen hypercall. + Userspace is expected to place the hypercall result into the appropriate + field before invoking KVM_RUN again. + +:: + /* Fix the size of the union. */ char padding[256]; }; @@ -6038,6 +6178,53 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space can then handle to implement model specific MSR handling and/or user notifications to inform a user that an MSR was not handled. +7.22 KVM_CAP_X86_BUS_LOCK_EXIT +------------------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] defines the policy used when bus locks detected in guest +:Returns: 0 on success, -EINVAL when args[0] contains invalid bits + +Valid bits in args[0] are:: + + #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) + #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) + +Enabling this capability on a VM provides userspace with a way to select +a policy to handle the bus locks detected in guest. Userspace can obtain +the supported modes from the result of KVM_CHECK_EXTENSION and define it +through the KVM_ENABLE_CAP. + +KVM_BUS_LOCK_DETECTION_OFF and KVM_BUS_LOCK_DETECTION_EXIT are supported +currently and mutually exclusive with each other. More bits can be added in +the future. + +With KVM_BUS_LOCK_DETECTION_OFF set, bus locks in guest will not cause vm exits +so that no additional actions are needed. This is the default mode. + +With KVM_BUS_LOCK_DETECTION_EXIT set, vm exits happen when bus lock detected +in VM. KVM just exits to userspace when handling them. Userspace can enforce +its own throttling or other policy based mitigations. + +This capability is aimed to address the thread that VM can exploit bus locks to +degree the performance of the whole system. Once the userspace enable this +capability and select the KVM_BUS_LOCK_DETECTION_EXIT mode, KVM will set the +KVM_RUN_BUS_LOCK flag in vcpu-run->flags field and exit to userspace. Concerning +the bus lock vm exit can be preempted by a higher priority VM exit, the exit +notifications to userspace can be KVM_EXIT_BUS_LOCK or other reasons. +KVM_RUN_BUS_LOCK flag is used to distinguish between them. + +7.22 KVM_CAP_PPC_DAWR1 +---------------------- + +:Architectures: ppc +:Parameters: none +:Returns: 0 on success, -EINVAL when CPU doesn't support 2nd DAWR + +This capability can be used to check / enable 2nd DAWR feature provided +by POWER10 processor. + 8. Other capabilities. ====================== @@ -6415,7 +6602,6 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. - 8.29 KVM_CAP_DIRTY_LOG_RING --------------------------- @@ -6502,3 +6688,29 @@ KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG. After enabling KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual machine will switch to ring-buffer dirty page tracking and further KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail. + +8.30 KVM_CAP_XEN_HVM +-------------------- + +:Architectures: x86 + +This capability indicates the features that Xen supports for hosting Xen +PVHVM guests. Valid flags are:: + + #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) + #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) + #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) + +The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG +ioctl is available, for the guest to set its hypercall page. + +If KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL is also set, the same flag may also be +provided in the flags to KVM_XEN_HVM_CONFIG, without providing hypercall page +contents, to request that KVM generate hypercall page content automatically +and also enable interception of guest hypercalls with KVM_EXIT_XEN. + +The KVM_XEN_HVM_CONFIG_SHARED_INFO flag indicates the availability of the +KVM_XEN_HVM_SET_ATTR, KVM_XEN_HVM_GET_ATTR, KVM_XEN_VCPU_SET_ATTR and +KVM_XEN_VCPU_GET_ATTR ioctls, as well as the delivery of exception vectors +for event channel upcalls when the evtchn_upcall_pending field of a vcpu's +vcpu_info is set. diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index b21a34c34a21..0aa4817b466d 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows: - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring them together is quite rare. -On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. +On x86: + +- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock + +- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is + taken inside kvm->arch.mmu_lock, and cannot be taken without already + holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise + there's no need to take kvm->arch.tdp_mmu_pages_lock at all). Everything else is a leaf: no other lock is taken inside the critical sections. diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h index daa1a1da539e..737ded6b6d0d 100644 --- a/arch/arm64/include/asm/hyp_image.h +++ b/arch/arm64/include/asm/hyp_image.h @@ -7,6 +7,9 @@ #ifndef __ARM64_HYP_IMAGE_H__ #define __ARM64_HYP_IMAGE_H__ +#define __HYP_CONCAT(a, b) a ## b +#define HYP_CONCAT(a, b) __HYP_CONCAT(a, b) + /* * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, * to separate it from the kernel proper. @@ -21,9 +24,31 @@ */ #define HYP_SECTION_NAME(NAME) .hyp##NAME +/* Symbol defined at the beginning of each hyp section. */ +#define HYP_SECTION_SYMBOL_NAME(NAME) \ + HYP_CONCAT(__hyp_section_, HYP_SECTION_NAME(NAME)) + +/* + * Helper to generate linker script statements starting a hyp section. + * + * A symbol with a well-known name is defined at the first byte. This + * is used as a base for hyp relocations (see gen-hyprel.c). It must + * be defined inside the section so the linker of `vmlinux` cannot + * separate it from the section data. + */ +#define BEGIN_HYP_SECTION(NAME) \ + HYP_SECTION_NAME(NAME) : { \ + HYP_SECTION_SYMBOL_NAME(NAME) = .; + +/* Helper to generate linker script statements ending a hyp section. */ +#define END_HYP_SECTION \ + } + /* Defines an ELF hyp section from input section @NAME and its subsections. */ -#define HYP_SECTION(NAME) \ - HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) } +#define HYP_SECTION(NAME) \ + BEGIN_HYP_SECTION(NAME) \ + *(NAME NAME##.*) \ + END_HYP_SECTION /* * Defines a linker script alias of a kernel-proper symbol referenced by diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7ccf770c53d9..22d933e9b59e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -199,26 +199,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -/* - * Obtain the PC-relative address of a kernel symbol - * s: symbol - * - * The goal of this macro is to return a symbol's address based on a - * PC-relative computation, as opposed to a loading the VA from a - * constant pool or something similar. This works well for HYP, as an - * absolute VA is guaranteed to be wrong. Only use this if trying to - * obtain the address of a symbol (i.e. not something you obtained by - * following a pointer). - */ -#define hyp_symbol_addr(s) \ - ({ \ - typeof(s) *addr; \ - asm("adrp %0, %1\n" \ - "add %0, %0, :lo12:%1\n" \ - : "=r" (addr) : "S" (&s)); \ - addr; \ - }) - #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ " .align 3\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8fcfab0c2567..3d10e6527f7d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -30,7 +30,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#define KVM_USER_MEM_SLOTS 512 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include <kvm/arm_vgic.h> @@ -771,4 +770,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); #define kvm_vcpu_has_pmu(vcpu) \ (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) +int kvm_trng_call(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e52d82aeadca..90873851f677 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -73,49 +73,39 @@ alternative_cb_end .endm /* - * Convert a kernel image address to a PA - * reg: kernel address to be converted in place + * Convert a hypervisor VA to a PA + * reg: hypervisor address to be converted in place * tmp: temporary register - * - * The actual code generation takes place in kvm_get_kimage_voffset, and - * the instructions below are only there to reserve the space and - * perform the register allocation (kvm_get_kimage_voffset uses the - * specific registers encoded in the instructions). */ -.macro kimg_pa reg, tmp -alternative_cb kvm_get_kimage_voffset - movz \tmp, #0 - movk \tmp, #0, lsl #16 - movk \tmp, #0, lsl #32 - movk \tmp, #0, lsl #48 -alternative_cb_end - - /* reg = __pa(reg) */ - sub \reg, \reg, \tmp +.macro hyp_pa reg, tmp + ldr_l \tmp, hyp_physvirt_offset + add \reg, \reg, \tmp .endm /* - * Convert a kernel image address to a hyp VA - * reg: kernel address to be converted in place + * Convert a hypervisor VA to a kernel image address + * reg: hypervisor address to be converted in place * tmp: temporary register * * The actual code generation takes place in kvm_get_kimage_voffset, and * the instructions below are only there to reserve the space and - * perform the register allocation (kvm_update_kimg_phys_offset uses the + * perform the register allocation (kvm_get_kimage_voffset uses the * specific registers encoded in the instructions). */ -.macro kimg_hyp_va reg, tmp -alternative_cb kvm_update_kimg_phys_offset +.macro hyp_kimg_va reg, tmp + /* Convert hyp VA -> PA. */ + hyp_pa \reg, \tmp + + /* Load kimage_voffset. */ +alternative_cb kvm_get_kimage_voffset movz \tmp, #0 movk \tmp, #0, lsl #16 movk \tmp, #0, lsl #32 movk \tmp, #0, lsl #48 alternative_cb_end - sub \reg, \reg, \tmp - mov_q \tmp, PAGE_OFFSET - orr \reg, \reg, \tmp - kern_hyp_va \reg + /* Convert PA -> kimg VA. */ + add \reg, \reg, \tmp .endm #else @@ -129,6 +119,7 @@ alternative_cb_end void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); +void kvm_apply_hyp_relocations(void); static __always_inline unsigned long __kern_hyp_va(unsigned long v) { @@ -144,24 +135,6 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) -static __always_inline unsigned long __kimg_hyp_va(unsigned long v) -{ - unsigned long offset; - - asm volatile(ALTERNATIVE_CB("movz %0, #0\n" - "movk %0, #0, lsl #16\n" - "movk %0, #0, lsl #32\n" - "movk %0, #0, lsl #48\n", - kvm_update_kimg_phys_offset) - : "=r" (offset)); - - return __kern_hyp_va((v - offset) | PAGE_OFFSET); -} - -#define kimg_fn_hyp_va(v) ((typeof(*v))(__kimg_hyp_va((unsigned long)(v)))) - -#define kimg_fn_ptr(x) (typeof(x) **)(x) - /* * We currently support using a VM-specified IPA size. For backward * compatibility, the default IPA size is fixed to 40bits. diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 52ab38db04c7..8886d43cfb11 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -157,6 +157,11 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. * + * Note that the update of a valid leaf PTE in this function will be aborted, + * if it's trying to recreate the exact same mapping or only change the access + * permissions. Instead, the vCPU will exit one more time from guest if still + * needed and then go through the path of relaxing permissions. + * * Note that this function will both coalesce existing table entries and split * existing block mappings, relying on page-faults to fault back areas outside * of the new mapping lazily. diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 8ff579361731..2f36b16a5b5d 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -11,7 +11,8 @@ extern char __alt_instructions[], __alt_instructions_end[]; extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; -extern char __hyp_data_ro_after_init_start[], __hyp_data_ro_after_init_end[]; +extern char __hyp_rodata_start[], __hyp_rodata_end[]; +extern char __hyp_reloc_begin[], __hyp_reloc_end[]; extern char __idmap_text_start[], __idmap_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __inittext_begin[], __inittext_end[]; diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index 9083d6992603..0525c0b089ed 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -5,8 +5,8 @@ #ifndef __ASM_SPINLOCK_H #define __ASM_SPINLOCK_H -#include <asm/qrwlock.h> #include <asm/qspinlock.h> +#include <asm/qrwlock.h> /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 767bb2d47be9..f9fbbb4734e8 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -853,7 +853,10 @@ #define ID_DFR0_PERFMON_SHIFT 24 +#define ID_DFR0_PERFMON_8_0 0x3 #define ID_DFR0_PERFMON_8_1 0x4 +#define ID_DFR0_PERFMON_8_4 0x5 +#define ID_DFR0_PERFMON_8_5 0x6 #define ID_ISAR4_SWP_FRAC_SHIFT 28 #define ID_ISAR4_PSR_M_SHIFT 24 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index f676243abac6..23f1a557bd9f 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -64,7 +64,6 @@ __efistub__ctype = _ctype; /* Alternative callbacks for init-time patching of nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); -KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset); KVM_NVHE_ALIAS(kvm_get_kimage_voffset); /* Global kernel state accessed by nVHE hyp code. */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ad00f99ee9b0..357590beaabb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -434,8 +434,10 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); - if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) + if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) { kvm_compute_layout(); + kvm_apply_hyp_relocations(); + } } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 68f76a96c60b..7eea7888bb02 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -31,10 +31,11 @@ jiffies = jiffies_64; __stop___kvm_ex_table = .; #define HYPERVISOR_DATA_SECTIONS \ - HYP_SECTION_NAME(.data..ro_after_init) : { \ - __hyp_data_ro_after_init_start = .; \ + HYP_SECTION_NAME(.rodata) : { \ + __hyp_rodata_start = .; \ *(HYP_SECTION_NAME(.data..ro_after_init)) \ - __hyp_data_ro_after_init_end = .; \ + *(HYP_SECTION_NAME(.rodata)) \ + __hyp_rodata_end = .; \ } #define HYPERVISOR_PERCPU_SECTION \ @@ -42,10 +43,19 @@ jiffies = jiffies_64; HYP_SECTION_NAME(.data..percpu) : { \ *(HYP_SECTION_NAME(.data..percpu)) \ } + +#define HYPERVISOR_RELOC_SECTION \ + .hyp.reloc : ALIGN(4) { \ + __hyp_reloc_begin = .; \ + *(.hyp.reloc) \ + __hyp_reloc_end = .; \ + } + #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE #define HYPERVISOR_DATA_SECTIONS #define HYPERVISOR_PERCPU_SECTION +#define HYPERVISOR_RELOC_SECTION #endif #define HYPERVISOR_TEXT \ @@ -216,6 +226,8 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) HYPERVISOR_PERCPU_SECTION + HYPERVISOR_RELOC_SECTION + .rela.dyn : ALIGN(8) { *(.rela .rela*) } diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 13b017284bf9..589921392cb1 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ - arch_timer.o \ + arch_timer.o trng.o\ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6804c5673312..fc4c95dd2d26 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1750,11 +1750,10 @@ static int init_hyp_mode(void) goto out_err; } - err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_ro_after_init_start), - kvm_ksym_ref(__hyp_data_ro_after_init_end), - PAGE_HYP_RO); + err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), + kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); if (err) { - kvm_err("Cannot map .hyp.data..ro_after_init section\n"); + kvm_err("Cannot map .hyp.rodata section\n"); goto out_err; } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 84473574c2e7..54f4860cd87c 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -505,8 +505,8 @@ static inline void __kvm_unexpected_el2_exception(void) struct exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); - entry = hyp_symbol_addr(__start___kvm_ex_table); - end = hyp_symbol_addr(__stop___kvm_ex_table); + entry = &__start___kvm_ex_table; + end = &__stop___kvm_ex_table; while (entry < end) { addr = (unsigned long)&entry->insn + entry->insn; diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore index 695d73d0249e..5b6c43cc96f8 100644 --- a/arch/arm64/kvm/hyp/nvhe/.gitignore +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -1,2 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only +gen-hyprel hyp.lds +hyp-reloc.S diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 1f1e351c5fe2..a6707df4f6c0 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -3,8 +3,11 @@ # Makefile for Kernel-based Virtual Machine module, HYP/nVHE part # -asflags-y := -D__KVM_NVHE_HYPERVISOR__ -ccflags-y := -D__KVM_NVHE_HYPERVISOR__ +asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS +ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS + +hostprogs := gen-hyprel +HOST_EXTRACFLAGS += -I$(objtree)/include obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o @@ -19,7 +22,7 @@ obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) obj-y := kvm_nvhe.o -extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds +extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o # 1) Compile all source files to `.nvhe.o` object files. The file extension # avoids file name clashes for files shared with VHE. @@ -42,11 +45,31 @@ LDFLAGS_kvm_nvhe.tmp.o := -r -T $(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE $(call if_changed,ld) -# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# 4) Generate list of hyp code/data positions that need to be relocated at +# runtime. Because the hypervisor is part of the kernel binary, relocations +# produce a kernel VA. We enumerate relocations targeting hyp at build time +# and convert the kernel VAs at those positions to hyp VAs. +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel + $(call if_changed,hyprel) + +# 5) Compile hyp-reloc.S and link it into the existing partially linked object. +# The object file now contains a section with pointers to hyp positions that +# will contain kernel VAs at runtime. These pointers have relocations on them +# so that they get updated as the hyp object is linked into `vmlinux`. +LDFLAGS_kvm_nvhe.rel.o := -r +$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE + $(call if_changed,ld) + +# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. # Prefixes names of ELF symbols with '__kvm_nvhe_'. -$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE $(call if_changed,hypcopy) +# The HYPREL command calls `gen-hyprel` to generate an assembly file with +# a list of relocations targeting hyp code/data. +quiet_cmd_hyprel = HYPREL $@ + cmd_hyprel = $(obj)/gen-hyprel $< > $@ + # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names # to avoid clashes with VHE code/data. quiet_cmd_hypcopy = HYPCOPY $@ diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c new file mode 100644 index 000000000000..ead02c6a7628 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google LLC + * Author: David Brazdil <dbrazdil@google.com> + * + * Generates relocation information used by the kernel to convert + * absolute addresses in hyp data from kernel VAs to hyp VAs. + * + * This is necessary because hyp code is linked into the same binary + * as the kernel but executes under different memory mappings. + * If the compiler used absolute addressing, those addresses need to + * be converted before they are used by hyp code. + * + * The input of this program is the relocatable ELF object containing + * all hyp code/data, not yet linked into vmlinux. Hyp section names + * should have been prefixed with `.hyp` at this point. + * + * The output (printed to stdout) is an assembly file containing + * an array of 32-bit integers and static relocations that instruct + * the linker of `vmlinux` to populate the array entries with offsets + * to positions in the kernel binary containing VAs used by hyp code. + * + * Note that dynamic relocations could be used for the same purpose. + * However, those are only generated if CONFIG_RELOCATABLE=y. + */ + +#include <elf.h> +#include <endian.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <generated/autoconf.h> + +#define HYP_SECTION_PREFIX ".hyp" +#define HYP_RELOC_SECTION ".hyp.reloc" +#define HYP_SECTION_SYMBOL_PREFIX "__hyp_section_" + +/* + * AArch64 relocation type constants. + * Included in case these are not defined in the host toolchain. + */ +#ifndef R_AARCH64_ABS64 +#define R_AARCH64_ABS64 257 +#endif +#ifndef R_AARCH64_LD_PREL_LO19 +#define R_AARCH64_LD_PREL_LO19 273 +#endif +#ifndef R_AARCH64_ADR_PREL_LO21 +#define R_AARCH64_ADR_PREL_LO21 274 +#endif +#ifndef R_AARCH64_ADR_PREL_PG_HI21 +#define R_AARCH64_ADR_PREL_PG_HI21 275 +#endif +#ifndef R_AARCH64_ADR_PREL_PG_HI21_NC +#define R_AARCH64_ADR_PREL_PG_HI21_NC 276 +#endif +#ifndef R_AARCH64_ADD_ABS_LO12_NC +#define R_AARCH64_ADD_ABS_LO12_NC 277 +#endif +#ifndef R_AARCH64_LDST8_ABS_LO12_NC +#define R_AARCH64_LDST8_ABS_LO12_NC 278 +#endif +#ifndef R_AARCH64_TSTBR14 +#define R_AARCH64_TSTBR14 279 +#endif +#ifndef R_AARCH64_CONDBR19 +#define R_AARCH64_CONDBR19 280 +#endif +#ifndef R_AARCH64_JUMP26 +#define R_AARCH64_JUMP26 282 +#endif +#ifndef R_AARCH64_CALL26 +#define R_AARCH64_CALL26 283 +#endif +#ifndef R_AARCH64_LDST16_ABS_LO12_NC +#define R_AARCH64_LDST16_ABS_LO12_NC 284 +#endif +#ifndef R_AARCH64_LDST32_ABS_LO12_NC +#define R_AARCH64_LDST32_ABS_LO12_NC 285 +#endif +#ifndef R_AARCH64_LDST64_ABS_LO12_NC +#define R_AARCH64_LDST64_ABS_LO12_NC 286 +#endif +#ifndef R_AARCH64_MOVW_PREL_G0 +#define R_AARCH64_MOVW_PREL_G0 287 +#endif +#ifndef R_AARCH64_MOVW_PREL_G0_NC +#define R_AARCH64_MOVW_PREL_G0_NC 288 +#endif +#ifndef R_AARCH64_MOVW_PREL_G1 +#define R_AARCH64_MOVW_PREL_G1 289 +#endif +#ifndef R_AARCH64_MOVW_PREL_G1_NC +#define R_AARCH64_MOVW_PREL_G1_NC 290 +#endif +#ifndef R_AARCH64_MOVW_PREL_G2 +#define R_AARCH64_MOVW_PREL_G2 291 +#endif +#ifndef R_AARCH64_MOVW_PREL_G2_NC +#define R_AARCH64_MOVW_PREL_G2_NC 292 +#endif +#ifndef R_AARCH64_MOVW_PREL_G3 +#define R_AARCH64_MOVW_PREL_G3 293 +#endif +#ifndef R_AARCH64_LDST128_ABS_LO12_NC +#define R_AARCH64_LDST128_ABS_LO12_NC 299 +#endif + +/* Global state of the processed ELF. */ +static struct { + const char *path; + char *begin; + size_t size; + Elf64_Ehdr *ehdr; + Elf64_Shdr *sh_table; + const char *sh_string; +} elf; + +#if defined(CONFIG_CPU_LITTLE_ENDIAN) + +#define elf16toh(x) le16toh(x) +#define elf32toh(x) le32toh(x) +#define elf64toh(x) le64toh(x) + +#define ELFENDIAN ELFDATA2LSB + +#elif defined(CONFIG_CPU_BIG_ENDIAN) + +#define elf16toh(x) be16toh(x) +#define elf32toh(x) be32toh(x) +#define elf64toh(x) be64toh(x) + +#define ELFENDIAN ELFDATA2MSB + +#else + +#error PDP-endian sadly unsupported... + +#endif + +#define fatal_error(fmt, ...) \ + ({ \ + fprintf(stderr, "error: %s: " fmt "\n", \ + elf.path, ## __VA_ARGS__); \ + exit(EXIT_FAILURE); \ + __builtin_unreachable(); \ + }) + +#define fatal_perror(msg) \ + ({ \ + fprintf(stderr, "error: %s: " msg ": %s\n", \ + elf.path, strerror(errno)); \ + exit(EXIT_FAILURE); \ + __builtin_unreachable(); \ + }) + +#define assert_op(lhs, rhs, fmt, op) \ + ({ \ + typeof(lhs) _lhs = (lhs); \ + typeof(rhs) _rhs = (rhs); \ + \ + if (!(_lhs op _rhs)) { \ + fatal_error("assertion " #lhs " " #op " " #rhs \ + " failed (lhs=" fmt ", rhs=" fmt \ + ", line=%d)", _lhs, _rhs, __LINE__); \ + } \ + }) + +#define assert_eq(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, ==) +#define assert_ne(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, !=) +#define assert_lt(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, <) +#define assert_ge(lhs, rhs, fmt) assert_op(lhs, rhs, fmt, >=) + +/* + * Return a pointer of a given type at a given offset from + * the beginning of the ELF file. + */ +#define elf_ptr(type, off) ((type *)(elf.begin + (off))) + +/* Iterate over all sections in the ELF. */ +#define for_each_section(var) \ + for (var = elf.sh_table; var < elf.sh_table + elf16toh(elf.ehdr->e_shnum); ++var) + +/* Iterate over all Elf64_Rela relocations in a given section. */ +#define for_each_rela(shdr, var) \ + for (var = elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset)); \ + var < elf_ptr(Elf64_Rela, elf64toh(shdr->sh_offset) + elf64toh(shdr->sh_size)); var++) + +/* True if a string starts with a given prefix. */ +static inline bool starts_with(const char *str, const char *prefix) +{ + return memcmp(str, prefix, strlen(prefix)) == 0; +} + +/* Returns a string containing the name of a given section. */ +static inline const char *section_name(Elf64_Shdr *shdr) +{ + return elf.sh_string + elf32toh(shdr->sh_name); +} + +/* Returns a pointer to the first byte of section data. */ +static inline const char *section_begin(Elf64_Shdr *shdr) +{ + return elf_ptr(char, elf64toh(shdr->sh_offset)); +} + +/* Find a section by its offset from the beginning of the file. */ +static inline Elf64_Shdr *section_by_off(Elf64_Off off) +{ + assert_ne(off, 0UL, "%lu"); + return elf_ptr(Elf64_Shdr, off); +} + +/* Find a section by its index. */ +static inline Elf64_Shdr *section_by_idx(uint16_t idx) +{ + assert_ne(idx, SHN_UNDEF, "%u"); + return &elf.sh_table[idx]; +} + +/* + * Memory-map the given ELF file, perform sanity checks, and + * populate global state. + */ +static void init_elf(const char *path) +{ + int fd, ret; + struct stat stat; + + /* Store path in the global struct for error printing. */ + elf.path = path; + + /* Open the ELF file. */ + fd = open(path, O_RDONLY); + if (fd < 0) + fatal_perror("Could not open ELF file"); + + /* Get status of ELF file to obtain its size. */ + ret = fstat(fd, &stat); + if (ret < 0) { + close(fd); + fatal_perror("Could not get status of ELF file"); + } + + /* mmap() the entire ELF file read-only at an arbitrary address. */ + elf.begin = mmap(0, stat.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (elf.begin == MAP_FAILED) { + close(fd); + fatal_perror("Could not mmap ELF file"); + } + + /* mmap() was successful, close the FD. */ + close(fd); + + /* Get pointer to the ELF header. */ + assert_ge(stat.st_size, sizeof(*elf.ehdr), "%lu"); + elf.ehdr = elf_ptr(Elf64_Ehdr, 0); + + /* Check the ELF magic. */ + assert_eq(elf.ehdr->e_ident[EI_MAG0], ELFMAG0, "0x%x"); + assert_eq(elf.ehdr->e_ident[EI_MAG1], ELFMAG1, "0x%x"); + assert_eq(elf.ehdr->e_ident[EI_MAG2], ELFMAG2, "0x%x"); + assert_eq(elf.ehdr->e_ident[EI_MAG3], ELFMAG3, "0x%x"); + + /* Sanity check that this is an ELF64 relocatable object for AArch64. */ + assert_eq(elf.ehdr->e_ident[EI_CLASS], ELFCLASS64, "%u"); + assert_eq(elf.ehdr->e_ident[EI_DATA], ELFENDIAN, "%u"); + assert_eq(elf16toh(elf.ehdr->e_type), ET_REL, "%u"); + assert_eq(elf16toh(elf.ehdr->e_machine), EM_AARCH64, "%u"); + + /* Populate fields of the global struct. */ + elf.sh_table = section_by_off(elf64toh(elf.ehdr->e_shoff)); + elf.sh_string = section_begin(section_by_idx(elf16toh(elf.ehdr->e_shstrndx))); +} + +/* Print the prologue of the output ASM file. */ +static void emit_prologue(void) +{ + printf(".data\n" + ".pushsection " HYP_RELOC_SECTION ", \"a\"\n"); +} + +/* Print ASM statements needed as a prologue to a processed hyp section. */ +static void emit_section_prologue(const char *sh_orig_name) +{ + /* Declare the hyp section symbol. */ + printf(".global %s%s\n", HYP_SECTION_SYMBOL_PREFIX, sh_orig_name); +} + +/* + * Print ASM statements to create a hyp relocation entry for a given + * R_AARCH64_ABS64 relocation. + * + * The linker of vmlinux will populate the position given by `rela` with + * an absolute 64-bit kernel VA. If the kernel is relocatable, it will + * also generate a dynamic relocation entry so that the kernel can shift + * the address at runtime for KASLR. + * + * Emit a 32-bit offset from the current address to the position given + * by `rela`. This way the kernel can iterate over all kernel VAs used + * by hyp at runtime and convert them to hyp VAs. However, that offset + * will not be known until linking of `vmlinux`, so emit a PREL32 + * relocation referencing a symbol that the hyp linker script put at + * the beginning of the relocated section + the offset from `rela`. + */ +static void emit_rela_abs64(Elf64_Rela *rela, const char *sh_orig_name) +{ + /* Offset of this reloc from the beginning of HYP_RELOC_SECTION. */ + static size_t reloc_offset; + + /* Create storage for the 32-bit offset. */ + printf(".word 0\n"); + + /* + * Create a PREL32 relocation which instructs the linker of `vmlinux` + * to insert offset to position <base> + <offset>, where <base> is + * a symbol at the beginning of the relocated section, and <offset> + * is `rela->r_offset`. + */ + printf(".reloc %lu, R_AARCH64_PREL32, %s%s + 0x%lx\n", + reloc_offset, HYP_SECTION_SYMBOL_PREFIX, sh_orig_name, + elf64toh(rela->r_offset)); + + reloc_offset += 4; +} + +/* Print the epilogue of the output ASM file. */ +static void emit_epilogue(void) +{ + printf(".popsection\n"); +} + +/* + * Iterate over all RELA relocations in a given section and emit + * hyp relocation data for all absolute addresses in hyp code/data. + * + * Static relocations that generate PC-relative-addressing are ignored. + * Failure is reported for unexpected relocation types. + */ +static void emit_rela_section(Elf64_Shdr *sh_rela) +{ + Elf64_Shdr *sh_orig = &elf.sh_table[elf32toh(sh_rela->sh_info)]; + const char *sh_orig_name = section_name(sh_orig); + Elf64_Rela *rela; + + /* Skip all non-hyp sections. */ + if (!starts_with(sh_orig_name, HYP_SECTION_PREFIX)) + return; + + emit_section_prologue(sh_orig_name); + + for_each_rela(sh_rela, rela) { + uint32_t type = (uint32_t)elf64toh(rela->r_info); + + /* Check that rela points inside the relocated section. */ + assert_lt(elf64toh(rela->r_offset), elf64toh(sh_orig->sh_size), "0x%lx"); + + switch (type) { + /* + * Data relocations to generate absolute addressing. + * Emit a hyp relocation. + */ + case R_AARCH64_ABS64: + emit_rela_abs64(rela, sh_orig_name); + break; + /* Allow relocations to generate PC-relative addressing. */ + case R_AARCH64_LD_PREL_LO19: + case R_AARCH64_ADR_PREL_LO21: + case R_AARCH64_ADR_PREL_PG_HI21: + case R_AARCH64_ADR_PREL_PG_HI21_NC: + case R_AARCH64_ADD_ABS_LO12_NC: + case R_AARCH64_LDST8_ABS_LO12_NC: + case R_AARCH64_LDST16_ABS_LO12_NC: + case R_AARCH64_LDST32_ABS_LO12_NC: + case R_AARCH64_LDST64_ABS_LO12_NC: + case R_AARCH64_LDST128_ABS_LO12_NC: + break; + /* Allow relative relocations for control-flow instructions. */ + case R_AARCH64_TSTBR14: + case R_AARCH64_CONDBR19: + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + break; + /* Allow group relocations to create PC-relative offset inline. */ + case R_AARCH64_MOVW_PREL_G0: + case R_AARCH64_MOVW_PREL_G0_NC: + case R_AARCH64_MOVW_PREL_G1: + case R_AARCH64_MOVW_PREL_G1_NC: + case R_AARCH64_MOVW_PREL_G2: + case R_AARCH64_MOVW_PREL_G2_NC: + case R_AARCH64_MOVW_PREL_G3: + break; + default: + fatal_error("Unexpected RELA type %u", type); + } + } +} + +/* Iterate over all sections and emit hyp relocation data for RELA sections. */ +static void emit_all_relocs(void) +{ + Elf64_Shdr *shdr; + + for_each_section(shdr) { + switch (elf32toh(shdr->sh_type)) { + case SHT_REL: + fatal_error("Unexpected SHT_REL section \"%s\"", + section_name(shdr)); + case SHT_RELA: + emit_rela_section(shdr); + break; + } + } +} + +int main(int argc, const char **argv) +{ + if (argc != 2) { + fprintf(stderr, "Usage: %s <elf_input>\n", argv[0]); + return EXIT_FAILURE; + } + + init_elf(argv[1]); + + emit_prologue(); + emit_all_relocs(); + emit_epilogue(); + + return EXIT_SUCCESS; +} diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index a820dfdc9c25..6585a7cbbc56 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -74,27 +74,28 @@ SYM_FUNC_END(__host_enter) * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) - /* Load the format arguments into x1-7 */ - mov x6, x3 - get_vcpu_ptr x7, x3 - - mrs x3, esr_el2 - mrs x4, far_el2 - mrs x5, hpfar_el2 - /* Prepare and exit to the host's panic funciton. */ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr ldr lr, =panic + hyp_kimg_va lr, x6 msr elr_el2, lr - /* - * Set the panic format string and enter the host, conditionally - * restoring the host context. - */ + /* Set the panic format string. Use the, now free, LR as scratch. */ + ldr lr, =__hyp_panic_string + hyp_kimg_va lr, x6 + + /* Load the format arguments into x1-7. */ + mov x6, x3 + get_vcpu_ptr x7, x3 + mrs x3, esr_el2 + mrs x4, far_el2 + mrs x5, hpfar_el2 + + /* Enter the host, conditionally restoring the host context. */ cmp x0, xzr - ldr x0, =__hyp_panic_string + mov x0, lr b.eq __host_enter_without_restoring b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) @@ -124,7 +125,7 @@ SYM_FUNC_END(__hyp_do_panic) * Preserve x0-x4, which may contain stub parameters. */ ldr x5, =__kvm_handle_stub_hvc - kimg_pa x5, x6 + hyp_pa x5, x6 br x5 .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 882fd40d068e..c631e29fb001 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -18,7 +18,7 @@ #include <asm/virt.h> .text - .pushsection .hyp.idmap.text, "ax" + .pushsection .idmap.text, "ax" .align 11 @@ -57,17 +57,10 @@ __do_hyp_init: cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc - // We only actively check bits [24:31], and everything - // else has to be zero, which we check at build time. -#if (KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) & 0xFFFFFFFF00FFFFFF) -#error Unexpected __KVM_HOST_SMCCC_FUNC___kvm_hyp_init value -#endif - - ror x0, x0, #24 - eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 24) & 0xF) - ror x0, x0, #4 - eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 28) & 0xF) - cbz x0, 1f + mov x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) + cmp x0, x3 + b.eq 1f + mov x0, #SMCCC_RET_NOT_SUPPORTED eret @@ -141,7 +134,6 @@ alternative_else_nop_endif /* Set the host vector */ ldr x0, =__kvm_hyp_host_vector - kimg_hyp_va x0, x1 msr vbar_el2, x0 ret @@ -200,7 +192,6 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) /* Leave idmap. */ mov x0, x29 ldr x1, =kvm_host_psci_cpu_entry - kimg_hyp_va x1, x2 br x1 SYM_CODE_END(__kvm_hyp_init_cpu) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a906f9e2ff34..f012f8665ecc 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -108,9 +108,9 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) typedef void (*hcall_t)(struct kvm_cpu_context *); -#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = kimg_fn_ptr(handle_##x) +#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x -static const hcall_t *host_hcall[] = { +static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), @@ -130,7 +130,6 @@ static const hcall_t *host_hcall[] = { static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); - const hcall_t *kfn; hcall_t hfn; id -= KVM_HOST_SMCCC_ID(0); @@ -138,13 +137,11 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) if (unlikely(id >= ARRAY_SIZE(host_hcall))) goto inval; - kfn = host_hcall[id]; - if (unlikely(!kfn)) + hfn = host_hcall[id]; + if (unlikely(!hfn)) goto inval; cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS; - - hfn = kimg_fn_hyp_va(kfn); hfn(host_ctxt); return; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 2997aa156d8e..879559057dee 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -33,8 +33,8 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu) if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)) hyp_panic(); - cpu_base_array = (unsigned long *)hyp_symbol_addr(kvm_arm_hyp_percpu_base); + cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base; this_cpu_base = kern_hyp_va(cpu_base_array[cpu]); - elf_base = (unsigned long)hyp_symbol_addr(__per_cpu_start); + elf_base = (unsigned long)&__per_cpu_start; return this_cpu_base - elf_base; } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index 1206d0d754d5..cd119d82d8e3 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -12,14 +12,17 @@ #include <asm/memory.h> SECTIONS { + HYP_SECTION(.idmap.text) HYP_SECTION(.text) + HYP_SECTION(.data..ro_after_init) + HYP_SECTION(.rodata) + /* * .hyp..data..percpu needs to be page aligned to maintain the same * alignment for when linking into vmlinux. */ . = ALIGN(PAGE_SIZE); - HYP_SECTION_NAME(.data..percpu) : { + BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) - } - HYP_SECTION(.data..ro_after_init) + END_HYP_SECTION } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 8e7128cb7667..63de71c0481e 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -128,8 +128,8 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt) if (cpu_id == INVALID_CPU_ID) return PSCI_RET_INVALID_PARAMS; - boot_args = per_cpu_ptr(hyp_symbol_addr(cpu_on_args), cpu_id); - init_params = per_cpu_ptr(hyp_symbol_addr(kvm_init_params), cpu_id); + boot_args = per_cpu_ptr(&cpu_on_args, cpu_id); + init_params = per_cpu_ptr(&kvm_init_params, cpu_id); /* Check if the target CPU is already being booted. */ if (!try_acquire_boot_args(boot_args)) @@ -140,7 +140,7 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context *host_ctxt) wmb(); ret = psci_call(func_id, mpidr, - __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_entry)), + __hyp_pa(&kvm_hyp_cpu_entry), __hyp_pa(init_params)); /* If successful, the lock will be released by the target CPU. */ @@ -159,8 +159,8 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) struct psci_boot_args *boot_args; struct kvm_nvhe_init_params *init_params; - boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args)); - init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params)); + boot_args = this_cpu_ptr(&suspend_args); + init_params = this_cpu_ptr(&kvm_init_params); /* * No need to acquire a lock before writing to boot_args because a core @@ -174,7 +174,7 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) * point if it is a deep sleep state. */ return psci_call(func_id, power_state, - __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)), + __hyp_pa(&kvm_hyp_cpu_resume), __hyp_pa(init_params)); } @@ -186,8 +186,8 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) struct psci_boot_args *boot_args; struct kvm_nvhe_init_params *init_params; - boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args)); - init_params = this_cpu_ptr(hyp_symbol_addr(kvm_init_params)); + boot_args = this_cpu_ptr(&suspend_args); + init_params = this_cpu_ptr(&kvm_init_params); /* * No need to acquire a lock before writing to boot_args because a core @@ -198,7 +198,7 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) /* Will only return on error. */ return psci_call(func_id, - __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_resume)), + __hyp_pa(&kvm_hyp_cpu_resume), __hyp_pa(init_params), 0); } @@ -207,12 +207,12 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) struct psci_boot_args *boot_args; struct kvm_cpu_context *host_ctxt; - host_ctxt = &this_cpu_ptr(hyp_symbol_addr(kvm_host_data))->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; if (is_cpu_on) - boot_args = this_cpu_ptr(hyp_symbol_addr(cpu_on_args)); + boot_args = this_cpu_ptr(&cpu_on_args); else - boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args)); + boot_args = this_cpu_ptr(&suspend_args); cpu_reg(host_ctxt, 0) = boot_args->r0; write_sysreg_el2(boot_args->pc, SYS_ELR); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index bdf8e55ed308..4d177ce1d536 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -45,6 +45,10 @@ #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -170,10 +174,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) smp_store_release(ptep, pte); } -static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, - u32 level) +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) { - kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa); + kvm_pte_t pte = kvm_phys_to_pte(pa); u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : KVM_PTE_TYPE_BLOCK; @@ -181,12 +184,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, pte |= FIELD_PREP(KVM_PTE_TYPE, type); pte |= KVM_PTE_VALID; - /* Tolerate KVM recreating the exact same mapping. */ - if (kvm_pte_valid(old)) - return old == pte; - - smp_store_release(ptep, pte); - return true; + return pte; } static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, @@ -341,12 +339,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { + kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; - WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)); + /* Tolerate KVM recreating the exact same mapping */ + new = kvm_init_valid_leaf_pte(phys, data->attr, level); + if (old != new && !WARN_ON(kvm_pte_valid(old))) + smp_store_release(ptep, new); + data->phys += granule; return true; } @@ -461,34 +464,41 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, return 0; } -static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - struct stage2_map_data *data) +static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) { + kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; + struct page *page = virt_to_page(ptep); if (!kvm_block_mapping_supported(addr, end, phys, level)) - return false; - - /* - * If the PTE was already valid, drop the refcount on the table - * early, as it will be bumped-up again in stage2_map_walk_leaf(). - * This ensures that the refcount stays constant across a valid to - * valid PTE update. - */ - if (kvm_pte_valid(*ptep)) - put_page(virt_to_page(ptep)); - - if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)) - goto out; + return -E2BIG; + + new = kvm_init_valid_leaf_pte(phys, data->attr, level); + if (kvm_pte_valid(old)) { + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS))) + return -EAGAIN; + + /* + * There's an existing different valid leaf entry, so perform + * break-before-make. + */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + put_page(page); + } - /* There's an existing valid leaf entry, so perform break-before-make */ - kvm_set_invalid_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); - kvm_set_valid_leaf_pte(ptep, phys, data->attr, level); -out: + smp_store_release(ptep, new); + get_page(page); data->phys += granule; - return true; + return 0; } static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, @@ -516,6 +526,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { + int ret; kvm_pte_t *childp, pte = *ptep; struct page *page = virt_to_page(ptep); @@ -526,8 +537,9 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; } - if (stage2_map_walker_try_leaf(addr, end, level, ptep, data)) - goto out_get_page; + ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data); + if (ret != -E2BIG) + return ret; if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; @@ -551,9 +563,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, } kvm_set_table_pte(ptep, childp); - -out_get_page: get_page(page); + return 0; } diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 8f0585640241..87a54375bd6e 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -64,7 +64,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) } rd = kvm_vcpu_dabt_get_rd(vcpu); - addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; + addr = kvm_vgic_global_state.vcpu_hyp_va; addr += fault_ipa - vgic->vgic_cpu_base; if (kvm_vcpu_dabt_iswrite(vcpu)) { diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 25ea4ecb6449..ead21b98b620 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -71,6 +71,12 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) if (gpa != GPA_INVALID) val = gpa; break; + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + return kvm_trng_call(vcpu); default: return kvm_psci_call(vcpu); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7d2257cc5438..77cb2d28f2a4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -879,11 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PAGE_SIZE && !force_pte) vma_pagesize = transparent_hugepage_adjust(memslot, hva, &pfn, &fault_ipa); - if (writable) { + if (writable) prot |= KVM_PGTABLE_PROT_W; - kvm_set_pfn_dirty(pfn); - mark_page_dirty(kvm, gfn); - } if (fault_status != FSC_PERM && !device) clean_dcache_guest_page(pfn, vma_pagesize); @@ -911,11 +908,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, memcache); } + /* Mark the page dirty only if the fault is handled successfully */ + if (writable && !ret) { + kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); + } + out_unlock: spin_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); - return ret; + return ret != -EAGAIN ? ret : 0; } /* Resolve the access fault by making the page young again. */ diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 247422ac78a9..e9ec08b0b070 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -23,11 +23,11 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); static u32 kvm_pmu_event_mask(struct kvm *kvm) { switch (kvm->arch.pmuver) { - case 1: /* ARMv8.0 */ + case ID_AA64DFR0_PMUVER_8_0: return GENMASK(9, 0); - case 4: /* ARMv8.1 */ - case 5: /* ARMv8.4 */ - case 6: /* ARMv8.5 */ + case ID_AA64DFR0_PMUVER_8_1: + case ID_AA64DFR0_PMUVER_8_4: + case ID_AA64DFR0_PMUVER_8_5: return GENMASK(15, 0); default: /* Shouldn't be here, just for sanity */ WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver); @@ -795,6 +795,12 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) base = 0; } else { val = read_sysreg(pmceid1_el0); + /* + * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled + * as RAZ + */ + if (vcpu->kvm->arch.pmuver >= ID_AA64DFR0_PMUVER_8_4) + val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32); base = 32; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7c4f79532406..4f2f1e3145de 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -9,6 +9,7 @@ * Christoffer Dall <c.dall@virtualopensystems.com> */ +#include <linux/bitfield.h> #include <linux/bsearch.h> #include <linux/kvm_host.h> #include <linux/mm.h> @@ -700,14 +701,18 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 pmceid; + u64 pmceid, mask, shift; BUG_ON(p->is_write); if (pmu_access_el0_disabled(vcpu)) return false; + get_access_mask(r, &mask, &shift); + pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); + pmceid &= mask; + pmceid >>= shift; p->regval = pmceid; @@ -1021,6 +1026,8 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) @@ -1028,36 +1035,41 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, u32 id = reg_to_encoding(r); u64 val = raz ? 0 : read_sanitised_ftr_reg(id); - if (id == SYS_ID_AA64PFR0_EL1) { + switch (id) { + case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) - val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); - val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT); - val &= ~(0xfUL << ID_AA64PFR0_CSV2_SHIFT); - val |= ((u64)vcpu->kvm->arch.pfr0_csv2 << ID_AA64PFR0_CSV2_SHIFT); - val &= ~(0xfUL << ID_AA64PFR0_CSV3_SHIFT); - val |= ((u64)vcpu->kvm->arch.pfr0_csv3 << ID_AA64PFR0_CSV3_SHIFT); - } else if (id == SYS_ID_AA64PFR1_EL1) { - val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT); - } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) { - val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) | - (0xfUL << ID_AA64ISAR1_API_SHIFT) | - (0xfUL << ID_AA64ISAR1_GPA_SHIFT) | - (0xfUL << ID_AA64ISAR1_GPI_SHIFT)); - } else if (id == SYS_ID_AA64DFR0_EL1) { - u64 cap = 0; - - /* Limit guests to PMUv3 for ARMv8.1 */ - if (kvm_vcpu_has_pmu(vcpu)) - cap = ID_AA64DFR0_PMUVER_8_1; - + val &= ~FEATURE(ID_AA64PFR0_SVE); + val &= ~FEATURE(ID_AA64PFR0_AMU); + val &= ~FEATURE(ID_AA64PFR0_CSV2); + val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); + val &= ~FEATURE(ID_AA64PFR0_CSV3); + val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + break; + case SYS_ID_AA64PFR1_EL1: + val &= ~FEATURE(ID_AA64PFR1_MTE); + break; + case SYS_ID_AA64ISAR1_EL1: + if (!vcpu_has_ptrauth(vcpu)) + val &= ~(FEATURE(ID_AA64ISAR1_APA) | + FEATURE(ID_AA64ISAR1_API) | + FEATURE(ID_AA64ISAR1_GPA) | + FEATURE(ID_AA64ISAR1_GPI)); + break; + case SYS_ID_AA64DFR0_EL1: + /* Limit debug to ARMv8.0 */ + val &= ~FEATURE(ID_AA64DFR0_DEBUGVER); + val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6); + /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, - ID_AA64DFR0_PMUVER_SHIFT, - cap); - } else if (id == SYS_ID_DFR0_EL1) { - /* Limit guests to PMUv3 for ARMv8.1 */ + ID_AA64DFR0_PMUVER_SHIFT, + kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); + break; + case SYS_ID_DFR0_EL1: + /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, - ID_DFR0_PERFMON_SHIFT, - ID_DFR0_PERFMON_8_1); + ID_DFR0_PERFMON_SHIFT, + kvm_vcpu_has_pmu(vcpu) ? ID_DFR0_PERFMON_8_4 : 0); + break; } return val; @@ -1493,6 +1505,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pminten, .reg = PMINTENSET_EL1 }, { PMU_SYS_REG(SYS_PMINTENCLR_EL1), .access = access_pminten, .reg = PMINTENSET_EL1 }, + { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, @@ -1720,7 +1733,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, }; -static bool trap_dbgidr(struct kvm_vcpu *vcpu, +static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1734,7 +1747,7 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu, p->regval = ((((dfr >> ID_AA64DFR0_WRPS_SHIFT) & 0xf) << 28) | (((dfr >> ID_AA64DFR0_BRPS_SHIFT) & 0xf) << 24) | (((dfr >> ID_AA64DFR0_CTX_CMPS_SHIFT) & 0xf) << 20) - | (6 << 16) | (el3 << 14) | (el3 << 12)); + | (6 << 16) | (1 << 15) | (el3 << 14) | (el3 << 12)); return true; } } @@ -1767,8 +1780,8 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu, * guest. Revisit this one day, would this principle change. */ static const struct sys_reg_desc cp14_regs[] = { - /* DBGIDR */ - { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgidr }, + /* DBGDIDR */ + { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgdidr }, /* DBGDTRRXext */ { Op1( 0), CRn( 0), CRm( 0), Op2( 2), trap_raz_wi }, @@ -1918,8 +1931,8 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs }, { Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc }, { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr }, - { Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid }, - { Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid }, + { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid }, + { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid }, { Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_evcntr }, { Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_evtyper }, { Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_evcntr }, @@ -1927,6 +1940,10 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pminten }, { Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten }, { Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs }, + { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 4), access_pmceid }, + { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 5), access_pmceid }, + /* PMMIR */ + { Op1( 0), CRn( 9), CRm(14), Op2( 6), trap_raz_wi }, /* PRRR/MAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, diff --git a/arch/arm64/kvm/trng.c b/arch/arm64/kvm/trng.c new file mode 100644 index 000000000000..99bdd7103c9c --- /dev/null +++ b/arch/arm64/kvm/trng.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 Arm Ltd. + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> + +#include <kvm/arm_hypercalls.h> + +#define ARM_SMCCC_TRNG_VERSION_1_0 0x10000UL + +/* Those values are deliberately separate from the generic SMCCC definitions. */ +#define TRNG_SUCCESS 0UL +#define TRNG_NOT_SUPPORTED ((unsigned long)-1) +#define TRNG_INVALID_PARAMETER ((unsigned long)-2) +#define TRNG_NO_ENTROPY ((unsigned long)-3) + +#define TRNG_MAX_BITS64 192 + +static const uuid_t arm_smc_trng_uuid __aligned(4) = UUID_INIT( + 0x0d21e000, 0x4384, 0x11eb, 0x80, 0x70, 0x52, 0x44, 0x55, 0x4e, 0x5a, 0x4c); + +static int kvm_trng_do_rnd(struct kvm_vcpu *vcpu, int size) +{ + DECLARE_BITMAP(bits, TRNG_MAX_BITS64); + u32 num_bits = smccc_get_arg1(vcpu); + int i; + + if (num_bits > 3 * size) { + smccc_set_retval(vcpu, TRNG_INVALID_PARAMETER, 0, 0, 0); + return 1; + } + + /* get as many bits as we need to fulfil the request */ + for (i = 0; i < DIV_ROUND_UP(num_bits, BITS_PER_LONG); i++) + bits[i] = get_random_long(); + + bitmap_clear(bits, num_bits, TRNG_MAX_BITS64 - num_bits); + + if (size == 32) + smccc_set_retval(vcpu, TRNG_SUCCESS, lower_32_bits(bits[1]), + upper_32_bits(bits[0]), lower_32_bits(bits[0])); + else + smccc_set_retval(vcpu, TRNG_SUCCESS, bits[2], bits[1], bits[0]); + + memzero_explicit(bits, sizeof(bits)); + return 1; +} + +int kvm_trng_call(struct kvm_vcpu *vcpu) +{ + const __le32 *u = (__le32 *)arm_smc_trng_uuid.b; + u32 func_id = smccc_get_function(vcpu); + unsigned long val = TRNG_NOT_SUPPORTED; + int size = 64; + + switch (func_id) { + case ARM_SMCCC_TRNG_VERSION: + val = ARM_SMCCC_TRNG_VERSION_1_0; + break; + case ARM_SMCCC_TRNG_FEATURES: + switch (smccc_get_arg1(vcpu)) { + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + val = TRNG_SUCCESS; + } + break; + case ARM_SMCCC_TRNG_GET_UUID: + smccc_set_retval(vcpu, le32_to_cpu(u[0]), le32_to_cpu(u[1]), + le32_to_cpu(u[2]), le32_to_cpu(u[3])); + return 1; + case ARM_SMCCC_TRNG_RND32: + size = 32; + fallthrough; + case ARM_SMCCC_TRNG_RND64: + return kvm_trng_do_rnd(vcpu, size); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 70fcd6a12fe1..978301392d67 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -81,6 +81,34 @@ __init void kvm_compute_layout(void) init_hyp_physvirt_offset(); } +/* + * The .hyp.reloc ELF section contains a list of kimg positions that + * contains kimg VAs but will be accessed only in hyp execution context. + * Convert them to hyp VAs. See gen-hyprel.c for more details. + */ +__init void kvm_apply_hyp_relocations(void) +{ + int32_t *rel; + int32_t *begin = (int32_t *)__hyp_reloc_begin; + int32_t *end = (int32_t *)__hyp_reloc_end; + + for (rel = begin; rel < end; ++rel) { + uintptr_t *ptr, kimg_va; + + /* + * Each entry contains a 32-bit relative offset from itself + * to a kimg VA position. + */ + ptr = (uintptr_t *)lm_alias((char *)rel + *rel); + + /* Read the kimg VA value at the relocation address. */ + kimg_va = *ptr; + + /* Convert to hyp VA and store back to the relocation address. */ + *ptr = __early_kern_hyp_va((uintptr_t)lm_alias(kimg_va)); + } +} + static u32 compute_instruction(int n, u32 rd, u32 rn) { u32 insn = AARCH64_BREAK_FAULT; @@ -255,12 +283,6 @@ static void generate_mov_q(u64 val, __le32 *origptr, __le32 *updptr, int nr_inst *updptr++ = cpu_to_le32(insn); } -void kvm_update_kimg_phys_offset(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, int nr_inst) -{ - generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst); -} - void kvm_get_kimage_voffset(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 24f3d0f9996b..3a5612e7304c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -83,7 +83,6 @@ #define KVM_MAX_VCPUS 16 -#define KVM_USER_MEM_SLOTS 16 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 8a88eb265516..6ce2117e49f6 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -10,7 +10,6 @@ #define _ASM_SPINLOCK_H #include <asm/processor.h> -#include <asm/qrwlock.h> #include <asm-generic/qspinlock_types.h> @@ -27,5 +26,6 @@ static inline void queued_spin_unlock(struct qspinlock *lock) } #include <asm/qspinlock.h> +#include <asm/qrwlock.h> #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index c98f5141e3fc..ed6086d57b22 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -535,9 +535,12 @@ struct h_cpu_char_result { u64 behaviour; }; -/* Register state for entering a nested guest with H_ENTER_NESTED */ +/* + * Register state for entering a nested guest with H_ENTER_NESTED. + * New member must be added at the end. + */ struct hv_guest_state { - u64 version; /* version of this structure layout */ + u64 version; /* version of this structure layout, must be first */ u32 lpid; u32 vcpu_token; /* These registers are hypervisor privileged (at least for writing) */ @@ -566,10 +569,26 @@ struct hv_guest_state { u64 pidr; u64 cfar; u64 ppr; + /* Version 1 ends here */ + u64 dawr1; + u64 dawrx1; + /* Version 2 ends here */ }; /* Latest version of hv_guest_state structure */ -#define HV_GUEST_STATE_VERSION 1 +#define HV_GUEST_STATE_VERSION 2 + +static inline int hv_guest_state_size(unsigned int version) +{ + switch (version) { + case 1: + return offsetofend(struct hv_guest_state, ppr); + case 2: + return offsetofend(struct hv_guest_state, dawrx1); + default: + return -1; + } +} /* * From the document "H_GetPerformanceCounterInfo Interface" v1.07 diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 078f4648ea27..b6d31bff5209 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -74,16 +74,6 @@ struct kvm_split_mode { u8 do_nap; u8 napped[MAX_SMT_THREADS]; struct kvmppc_vcore *vc[MAX_SUBCORES]; - /* Bits for changing lpcr on P9 */ - unsigned long lpcr_req; - unsigned long lpidr_req; - unsigned long host_lpcr; - u32 do_set; - u32 do_restore; - union { - u32 allphases; - u8 phase[4]; - } lpcr_sync; }; /* @@ -110,7 +100,6 @@ struct kvmppc_host_state { u8 hwthread_state; u8 host_ipi; u8 ptid; /* thread number within subcore when split */ - u8 tid; /* thread number within whole core */ u8 fake_suspend; struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d67a470e95a3..05fb00d37609 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS -#define KVM_USER_MEM_SLOTS 512 #include <asm/cputhreads.h> @@ -307,6 +306,7 @@ struct kvm_arch { u8 svm_enabled; bool threads_indep; bool nested_enable; + bool dawr1_enabled; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; @@ -584,8 +584,10 @@ struct kvm_vcpu_arch { u32 ctrl; u32 dabrx; ulong dabr; - ulong dawr; - ulong dawrx; + ulong dawr0; + ulong dawrx0; + ulong dawr1; + ulong dawrx1; ulong ciabr; ulong cfar; ulong ppr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 0a056c64c317..df4bda867bab 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -314,6 +314,8 @@ struct kvmppc_ops { int size); int (*enable_svm)(struct kvm *kvm); int (*svm_off)(struct kvm *kvm); + int (*enable_dawr1)(struct kvm *kvm); + bool (*hash_v3_possible)(void); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c3af3f324c5a..9f18fa090f1f 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -644,6 +644,8 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_MMCR3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1) #define KVM_REG_PPC_SIER2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2) #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) +#define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) +#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b12d7c049bfe..b690c70f061c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -526,8 +526,10 @@ int main(void) OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl); OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr); OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx); - OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr); - OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx); + OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0); + OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0); + OFFSET(VCPU_DAWR1, kvm_vcpu, arch.dawr1); + OFFSET(VCPU_DAWRX1, kvm_vcpu, arch.dawrx1); OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr); OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags); OFFSET(VCPU_DEC, kvm_vcpu, arch.dec); @@ -668,7 +670,6 @@ int main(void) HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); - HSTATE_FIELD(HSTATE_TID, tid); HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend); HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]); HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); @@ -698,8 +699,6 @@ int main(void) OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); - OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); - OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6f612d240392..f09708da216e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -134,7 +134,7 @@ static inline bool nesting_enabled(struct kvm *kvm) } /* If set, the threads on each CPU core have to be in the same MMU mode */ -static bool no_mixing_hpt_and_radix; +static bool no_mixing_hpt_and_radix __read_mostly; static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -782,8 +782,24 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, return H_UNSUPPORTED_FLAG_START; if (value2 & DABRX_HYP) return H_P4; - vcpu->arch.dawr = value1; - vcpu->arch.dawrx = value2; + vcpu->arch.dawr0 = value1; + vcpu->arch.dawrx0 = value2; + return H_SUCCESS; + case H_SET_MODE_RESOURCE_SET_DAWR1: + if (!kvmppc_power8_compatible(vcpu)) + return H_P2; + if (!ppc_breakpoint_available()) + return H_P2; + if (!cpu_has_feature(CPU_FTR_DAWR1)) + return H_P2; + if (!vcpu->kvm->arch.dawr1_enabled) + return H_FUNCTION; + if (mflags) + return H_UNSUPPORTED_FLAG_START; + if (value2 & DABRX_HYP) + return H_P4; + vcpu->arch.dawr1 = value1; + vcpu->arch.dawrx1 = value2; return H_SUCCESS; case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: /* KVM does not support mflags=2 (AIL=2) */ @@ -1759,10 +1775,16 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.vcore->vtb); break; case KVM_REG_PPC_DAWR: - *val = get_reg_val(id, vcpu->arch.dawr); + *val = get_reg_val(id, vcpu->arch.dawr0); break; case KVM_REG_PPC_DAWRX: - *val = get_reg_val(id, vcpu->arch.dawrx); + *val = get_reg_val(id, vcpu->arch.dawrx0); + break; + case KVM_REG_PPC_DAWR1: + *val = get_reg_val(id, vcpu->arch.dawr1); + break; + case KVM_REG_PPC_DAWRX1: + *val = get_reg_val(id, vcpu->arch.dawrx1); break; case KVM_REG_PPC_CIABR: *val = get_reg_val(id, vcpu->arch.ciabr); @@ -1991,10 +2013,16 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, vcpu->arch.vcore->vtb = set_reg_val(id, *val); break; case KVM_REG_PPC_DAWR: - vcpu->arch.dawr = set_reg_val(id, *val); + vcpu->arch.dawr0 = set_reg_val(id, *val); break; case KVM_REG_PPC_DAWRX: - vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; + vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP; + break; + case KVM_REG_PPC_DAWR1: + vcpu->arch.dawr1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWRX1: + vcpu->arch.dawrx1 = set_reg_val(id, *val) & ~DAWRX_HYP; break; case KVM_REG_PPC_CIABR: vcpu->arch.ciabr = set_reg_val(id, *val); @@ -2862,11 +2890,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm) return false; - /* Some POWER9 chips require all threads to be in the same MMU mode */ - if (no_mixing_hpt_and_radix && - kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) - return false; - if (n_threads < cip->max_subcore_threads) n_threads = cip->max_subcore_threads; if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) @@ -2905,6 +2928,9 @@ static void prepare_threads(struct kvmppc_vcore *vc) for_each_runnable_thread(i, vcpu, vc) { if (signal_pending(vcpu->arch.run_task)) vcpu->arch.ret = -EINTR; + else if (no_mixing_hpt_and_radix && + kvm_is_radix(vc->kvm) != radix_enabled()) + vcpu->arch.ret = -EINVAL; else if (vcpu->arch.vpa.update_pending || vcpu->arch.slb_shadow.update_pending || vcpu->arch.dtl.update_pending) @@ -3110,7 +3136,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int controlled_threads; int trap; bool is_power8; - bool hpt_on_radix; /* * Remove from the list any threads that have a signal pending @@ -3143,11 +3168,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * this is a HPT guest on a radix host machine where the * CPU threads may not be in different MMU modes. */ - hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() && - !kvm_is_radix(vc->kvm); - if (((controlled_threads > 1) && - ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || - (hpt_on_radix && vc->kvm->arch.threads_indep)) { + if ((controlled_threads > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); @@ -3215,7 +3237,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) && !cpu_has_feature(CPU_FTR_ARCH_300); - if (split > 1 || hpt_on_radix) { + if (split > 1) { sip = &split_info; memset(&split_info, 0, sizeof(split_info)); for (sub = 0; sub < core_info.n_subcores; ++sub) @@ -3237,13 +3259,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) split_info.subcore_size = subcore_size; } else { split_info.subcore_size = 1; - if (hpt_on_radix) { - /* Use the split_info for LPCR/LPIDR changes */ - split_info.lpcr_req = vc->lpcr; - split_info.lpidr_req = vc->kvm->arch.lpid; - split_info.host_lpcr = vc->kvm->arch.host_lpcr; - split_info.do_set = 1; - } } /* order writes to split_info before kvm_split_mode pointer */ @@ -3253,7 +3268,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) for (thr = 0; thr < controlled_threads; ++thr) { struct paca_struct *paca = paca_ptrs[pcpu + thr]; - paca->kvm_hstate.tid = thr; paca->kvm_hstate.napping = 0; paca->kvm_hstate.kvm_split_mode = sip; } @@ -3327,10 +3341,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * When doing micro-threading, poke the inactive threads as well. * This gets them to the nap instruction after kvm_do_nap, * which reduces the time taken to unsplit later. - * For POWER9 HPT guest on radix host, we need all the secondary - * threads woken up so they can do the LPCR/LPIDR change. */ - if (cmd_bit || hpt_on_radix) { + if (cmd_bit) { split_info.do_nap = 1; /* ask secondaries to nap when done */ for (thr = 1; thr < threads_per_subcore; ++thr) if (!(active & (1 << thr))) @@ -3391,19 +3403,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cpu_relax(); ++loops; } - } else if (hpt_on_radix) { - /* Wait for all threads to have seen final sync */ - for (thr = 1; thr < controlled_threads; ++thr) { - struct paca_struct *paca = paca_ptrs[pcpu + thr]; - - while (paca->kvm_hstate.kvm_split_mode) { - HMT_low(); - barrier(); - } - HMT_medium(); - } + split_info.do_nap = 0; } - split_info.do_nap = 0; kvmppc_set_host_core(pcpu); @@ -3449,10 +3450,17 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, int trap; unsigned long host_hfscr = mfspr(SPRN_HFSCR); unsigned long host_ciabr = mfspr(SPRN_CIABR); - unsigned long host_dawr = mfspr(SPRN_DAWR0); - unsigned long host_dawrx = mfspr(SPRN_DAWRX0); + unsigned long host_dawr0 = mfspr(SPRN_DAWR0); + unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0); unsigned long host_psscr = mfspr(SPRN_PSSCR); unsigned long host_pidr = mfspr(SPRN_PID); + unsigned long host_dawr1 = 0; + unsigned long host_dawrx1 = 0; + + if (cpu_has_feature(CPU_FTR_DAWR1)) { + host_dawr1 = mfspr(SPRN_DAWR1); + host_dawrx1 = mfspr(SPRN_DAWRX1); + } /* * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0, @@ -3489,8 +3497,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_SPURR, vcpu->arch.spurr); if (dawr_enabled()) { - mtspr(SPRN_DAWR0, vcpu->arch.dawr); - mtspr(SPRN_DAWRX0, vcpu->arch.dawrx); + mtspr(SPRN_DAWR0, vcpu->arch.dawr0); + mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0); + if (cpu_has_feature(CPU_FTR_DAWR1)) { + mtspr(SPRN_DAWR1, vcpu->arch.dawr1); + mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1); + } } mtspr(SPRN_CIABR, vcpu->arch.ciabr); mtspr(SPRN_IC, vcpu->arch.ic); @@ -3542,8 +3554,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); mtspr(SPRN_HFSCR, host_hfscr); mtspr(SPRN_CIABR, host_ciabr); - mtspr(SPRN_DAWR0, host_dawr); - mtspr(SPRN_DAWRX0, host_dawrx); + mtspr(SPRN_DAWR0, host_dawr0); + mtspr(SPRN_DAWRX0, host_dawrx0); + if (cpu_has_feature(CPU_FTR_DAWR1)) { + mtspr(SPRN_DAWR1, host_dawr1); + mtspr(SPRN_DAWRX1, host_dawrx1); + } mtspr(SPRN_PID, host_pidr); /* @@ -3595,6 +3611,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_tidr = mfspr(SPRN_TIDR); unsigned long host_iamr = mfspr(SPRN_IAMR); unsigned long host_amr = mfspr(SPRN_AMR); + unsigned long host_fscr = mfspr(SPRN_FSCR); s64 dec; u64 tb; int trap, save_pmu; @@ -3735,6 +3752,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (host_amr != vcpu->arch.amr) mtspr(SPRN_AMR, host_amr); + if (host_fscr != vcpu->arch.fscr) + mtspr(SPRN_FSCR, host_fscr); + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); store_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC @@ -4173,7 +4193,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, kvmppc_clear_host_core(pcpu); - local_paca->kvm_hstate.tid = 0; local_paca->kvm_hstate.napping = 0; local_paca->kvm_hstate.kvm_split_mode = NULL; kvmppc_start_thread(vcpu, vc); @@ -4358,15 +4377,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) do { /* - * The early POWER9 chips that can't mix radix and HPT threads - * on the same core also need the workaround for the problem - * where the TLB would prefetch entries in the guest exit path - * for radix guests using the guest PIDR value and LPID 0. - * The workaround is in the old path (kvmppc_run_vcpu()) - * but not the new path (kvmhv_run_single_vcpu()). + * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu + * path, which also handles hash and dependent threads mode. */ if (kvm->arch.threads_indep && kvm_is_radix(kvm) && - !no_mixing_hpt_and_radix) + !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) r = kvmhv_run_single_vcpu(vcpu, ~(u64)0, vcpu->arch.vcore->lpcr); else @@ -5599,6 +5614,26 @@ out: return ret; } +static int kvmhv_enable_dawr1(struct kvm *kvm) +{ + if (!cpu_has_feature(CPU_FTR_DAWR1)) + return -ENODEV; + + /* kvm == NULL means the caller is testing if the capability exists */ + if (kvm) + kvm->arch.dawr1_enabled = true; + return 0; +} + +static bool kvmppc_hash_v3_possible(void) +{ + if (radix_enabled() && no_mixing_hpt_and_radix) + return false; + + return cpu_has_feature(CPU_FTR_ARCH_300) && + cpu_has_feature(CPU_FTR_HVMODE); +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -5642,6 +5677,8 @@ static struct kvmppc_ops kvm_ops_hv = { .store_to_eaddr = kvmhv_store_to_eaddr, .enable_svm = kvmhv_enable_svm, .svm_off = kvmhv_svm_off, + .enable_dawr1 = kvmhv_enable_dawr1, + .hash_v3_possible = kvmppc_hash_v3_possible, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 8053efdf7ea7..f3d3183249fe 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -277,8 +277,7 @@ void kvmhv_commence_exit(int trap) struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; int ptid = local_paca->kvm_hstate.ptid; struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode; - int me, ee, i, t; - int cpu0; + int me, ee, i; /* Set our bit in the threads-exiting-guest map in the 0xff00 bits of vcore->entry_exit_map */ @@ -320,22 +319,6 @@ void kvmhv_commence_exit(int trap) if ((ee >> 8) == 0) kvmhv_interrupt_vcore(vc, ee); } - - /* - * On POWER9 when running a HPT guest on a radix host (sip != NULL), - * we have to interrupt inactive CPU threads to get them to - * restore the host LPCR value. - */ - if (sip->lpcr_req) { - if (cmpxchg(&sip->do_restore, 0, 1) == 0) { - vc = local_paca->kvm_hstate.kvm_vcore; - cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid; - for (t = 1; t < threads_per_core; ++t) { - if (sip->napped[t]) - kvmhv_rm_send_ipi(cpu0 + t); - } - } - } } struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; @@ -667,95 +650,6 @@ void kvmppc_bad_interrupt(struct pt_regs *regs) panic("Bad KVM trap"); } -/* - * Functions used to switch LPCR HR and UPRT bits on all threads - * when entering and exiting HPT guests on a radix host. - */ - -#define PHASE_REALMODE 1 /* in real mode */ -#define PHASE_SET_LPCR 2 /* have set LPCR */ -#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */ -#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */ - -#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p)) - -static void wait_for_sync(struct kvm_split_mode *sip, int phase) -{ - int thr = local_paca->kvm_hstate.tid; - - sip->lpcr_sync.phase[thr] |= phase; - phase = ALL(phase); - while ((sip->lpcr_sync.allphases & phase) != phase) { - HMT_low(); - barrier(); - } - HMT_medium(); -} - -void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip) -{ - int num_sets; - unsigned long rb, set; - - /* wait for every other thread to get to real mode */ - wait_for_sync(sip, PHASE_REALMODE); - - /* Set LPCR and LPIDR */ - mtspr(SPRN_LPCR, sip->lpcr_req); - mtspr(SPRN_LPID, sip->lpidr_req); - isync(); - - /* - * P10 will flush all the congruence class with a single tlbiel - */ - if (cpu_has_feature(CPU_FTR_ARCH_31)) - num_sets = 1; - else - num_sets = POWER9_TLB_SETS_RADIX; - - /* Invalidate the TLB on thread 0 */ - if (local_paca->kvm_hstate.tid == 0) { - sip->do_set = 0; - asm volatile("ptesync" : : : "memory"); - for (set = 0; set < num_sets; ++set) { - rb = TLBIEL_INVAL_SET_LPID + - (set << TLBIEL_INVAL_SET_SHIFT); - asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : : - "r" (rb), "r" (0)); - } - asm volatile("ptesync" : : : "memory"); - } - - /* indicate that we have done so and wait for others */ - wait_for_sync(sip, PHASE_SET_LPCR); - /* order read of sip->lpcr_sync.allphases vs. sip->do_set */ - smp_rmb(); -} - -/* - * Called when a thread that has been in the guest needs - * to reload the host LPCR value - but only on POWER9 when - * running a HPT guest on a radix host. - */ -void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) -{ - /* we're out of the guest... */ - wait_for_sync(sip, PHASE_OUT_OF_GUEST); - - mtspr(SPRN_LPID, 0); - mtspr(SPRN_LPCR, sip->host_lpcr); - isync(); - - if (local_paca->kvm_hstate.tid == 0) { - sip->do_restore = 0; - smp_wmb(); /* order store of do_restore vs. phase */ - } - - wait_for_sync(sip, PHASE_RESET_LPCR); - smp_mb(); - local_paca->kvm_hstate.kvm_split_mode = NULL; -} - static void kvmppc_end_cede(struct kvm_vcpu *vcpu) { vcpu->arch.ceded = 0; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 33b58549a9aa..0cd0e7aad588 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -33,8 +33,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) hr->dpdes = vc->dpdes; hr->hfscr = vcpu->arch.hfscr; hr->tb_offset = vc->tb_offset; - hr->dawr0 = vcpu->arch.dawr; - hr->dawrx0 = vcpu->arch.dawrx; + hr->dawr0 = vcpu->arch.dawr0; + hr->dawrx0 = vcpu->arch.dawrx0; hr->ciabr = vcpu->arch.ciabr; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; @@ -49,6 +49,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) hr->pidr = vcpu->arch.pid; hr->cfar = vcpu->arch.cfar; hr->ppr = vcpu->arch.ppr; + hr->dawr1 = vcpu->arch.dawr1; + hr->dawrx1 = vcpu->arch.dawrx1; } static void byteswap_pt_regs(struct pt_regs *regs) @@ -91,6 +93,8 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->pidr = swab64(hr->pidr); hr->cfar = swab64(hr->cfar); hr->ppr = swab64(hr->ppr); + hr->dawr1 = swab64(hr->dawr1); + hr->dawrx1 = swab64(hr->dawrx1); } static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, @@ -138,6 +142,7 @@ static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) /* Don't let data address watchpoint match in hypervisor state */ hr->dawrx0 &= ~DAWRX_HYP; + hr->dawrx1 &= ~DAWRX_HYP; /* Don't let completed instruction address breakpt match in HV state */ if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) @@ -151,8 +156,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) vc->pcr = hr->pcr | PCR_MASK; vc->dpdes = hr->dpdes; vcpu->arch.hfscr = hr->hfscr; - vcpu->arch.dawr = hr->dawr0; - vcpu->arch.dawrx = hr->dawrx0; + vcpu->arch.dawr0 = hr->dawr0; + vcpu->arch.dawrx0 = hr->dawrx0; vcpu->arch.ciabr = hr->ciabr; vcpu->arch.purr = hr->purr; vcpu->arch.spurr = hr->spurr; @@ -167,6 +172,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) vcpu->arch.pid = hr->pidr; vcpu->arch.cfar = hr->cfar; vcpu->arch.ppr = hr->ppr; + vcpu->arch.dawr1 = hr->dawr1; + vcpu->arch.dawrx1 = hr->dawrx1; } void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, @@ -215,12 +222,51 @@ static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr) } } +static int kvmhv_read_guest_state_and_regs(struct kvm_vcpu *vcpu, + struct hv_guest_state *l2_hv, + struct pt_regs *l2_regs, + u64 hv_ptr, u64 regs_ptr) +{ + int size; + + if (kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv->version, + sizeof(l2_hv->version))) + return -1; + + if (kvmppc_need_byteswap(vcpu)) + l2_hv->version = swab64(l2_hv->version); + + size = hv_guest_state_size(l2_hv->version); + if (size < 0) + return -1; + + return kvm_vcpu_read_guest(vcpu, hv_ptr, l2_hv, size) || + kvm_vcpu_read_guest(vcpu, regs_ptr, l2_regs, + sizeof(struct pt_regs)); +} + +static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu, + struct hv_guest_state *l2_hv, + struct pt_regs *l2_regs, + u64 hv_ptr, u64 regs_ptr) +{ + int size; + + size = hv_guest_state_size(l2_hv->version); + if (size < 0) + return -1; + + return kvm_vcpu_write_guest(vcpu, hv_ptr, l2_hv, size) || + kvm_vcpu_write_guest(vcpu, regs_ptr, l2_regs, + sizeof(struct pt_regs)); +} + long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) { long int err, r; struct kvm_nested_guest *l2; struct pt_regs l2_regs, saved_l1_regs; - struct hv_guest_state l2_hv, saved_l1_hv; + struct hv_guest_state l2_hv = {0}, saved_l1_hv; struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 hv_ptr, regs_ptr; u64 hdec_exp; @@ -235,17 +281,15 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) hv_ptr = kvmppc_get_gpr(vcpu, 4); regs_ptr = kvmppc_get_gpr(vcpu, 5); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, - sizeof(struct hv_guest_state)) || - kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs, - sizeof(struct pt_regs)); + err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs, + hv_ptr, regs_ptr); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (err) return H_PARAMETER; if (kvmppc_need_byteswap(vcpu)) byteswap_hv_regs(&l2_hv); - if (l2_hv.version != HV_GUEST_STATE_VERSION) + if (l2_hv.version > HV_GUEST_STATE_VERSION) return H_P2; if (kvmppc_need_byteswap(vcpu)) @@ -325,10 +369,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) byteswap_pt_regs(&l2_regs); } vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv, - sizeof(struct hv_guest_state)) || - kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs, - sizeof(struct pt_regs)); + err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs, + hv_ptr, regs_ptr); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (err) return H_AUTHORITY; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index cd9995ee8441..5e634db4809b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -52,11 +52,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_PID (SFS-32) #define STACK_SLOT_IAMR (SFS-40) #define STACK_SLOT_CIABR (SFS-48) -#define STACK_SLOT_DAWR (SFS-56) -#define STACK_SLOT_DAWRX (SFS-64) +#define STACK_SLOT_DAWR0 (SFS-56) +#define STACK_SLOT_DAWRX0 (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) #define STACK_SLOT_AMR (SFS-80) #define STACK_SLOT_UAMOR (SFS-88) +#define STACK_SLOT_DAWR1 (SFS-96) +#define STACK_SLOT_DAWRX1 (SFS-104) /* the following is used by the P9 short path */ #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ @@ -85,19 +87,6 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) RFI_TO_KERNEL kvmppc_call_hv_entry: -BEGIN_FTR_SECTION - /* On P9, do LPCR setting, if necessary */ - ld r3, HSTATE_SPLIT_MODE(r13) - cmpdi r3, 0 - beq 46f - lwz r4, KVM_SPLIT_DO_SET(r3) - cmpwi r4, 0 - beq 46f - bl kvmhv_p9_set_lpcr - nop -46: -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_hv_entry @@ -361,11 +350,11 @@ kvm_secondary_got_guest: LOAD_REG_ADDR(r6, decrementer_max) ld r6, 0(r6) mtspr SPRN_HDEC, r6 +BEGIN_FTR_SECTION /* and set per-LPAR registers, if doing dynamic micro-threading */ ld r6, HSTATE_SPLIT_MODE(r13) cmpdi r6, 0 beq 63f -BEGIN_FTR_SECTION ld r0, KVM_SPLIT_RPR(r6) mtspr SPRN_RPR, r0 ld r0, KVM_SPLIT_PMMAR(r6) @@ -373,16 +362,7 @@ BEGIN_FTR_SECTION ld r0, KVM_SPLIT_LDBAR(r6) mtspr SPRN_LDBAR, r0 isync -FTR_SECTION_ELSE - /* On P9 we use the split_info for coordinating LPCR changes */ - lwz r4, KVM_SPLIT_DO_SET(r6) - cmpwi r4, 0 - beq 1f - mr r3, r6 - bl kvmhv_p9_set_lpcr - nop -1: -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) 63: /* Order load of vcpu after load of vcore */ lwsync @@ -452,19 +432,15 @@ kvm_no_guest: mtcr r5 blr -53: HMT_LOW +53: +BEGIN_FTR_SECTION + HMT_LOW ld r5, HSTATE_KVM_VCORE(r13) cmpdi r5, 0 bne 60f ld r3, HSTATE_SPLIT_MODE(r13) cmpdi r3, 0 beq kvm_no_guest - lwz r0, KVM_SPLIT_DO_SET(r3) - cmpwi r0, 0 - bne kvmhv_do_set - lwz r0, KVM_SPLIT_DO_RESTORE(r3) - cmpwi r0, 0 - bne kvmhv_do_restore lbz r0, KVM_SPLIT_DO_NAP(r3) cmpwi r0, 0 beq kvm_no_guest @@ -472,24 +448,19 @@ kvm_no_guest: b kvm_unsplit_nap 60: HMT_MEDIUM b kvm_secondary_got_guest +FTR_SECTION_ELSE + HMT_LOW + ld r5, HSTATE_KVM_VCORE(r13) + cmpdi r5, 0 + beq kvm_no_guest + HMT_MEDIUM + b kvm_secondary_got_guest +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 54: li r0, KVM_HWTHREAD_IN_KVM stb r0, HSTATE_HWTHREAD_STATE(r13) b kvm_no_guest -kvmhv_do_set: - /* Set LPCR, LPIDR etc. on P9 */ - HMT_MEDIUM - bl kvmhv_p9_set_lpcr - nop - b kvm_no_guest - -kvmhv_do_restore: - HMT_MEDIUM - bl kvmhv_p9_restore_lpcr - nop - b kvm_no_guest - /* * Here the primary thread is trying to return the core to * whole-core mode, so we need to nap. @@ -527,7 +498,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Set kvm_split_mode.napped[tid] = 1 */ ld r3, HSTATE_SPLIT_MODE(r13) li r0, 1 - lbz r4, HSTATE_TID(r13) + lhz r4, PACAPACAINDEX(r13) + clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */ addi r4, r4, KVM_SPLIT_NAPPED stbx r0, r3, r4 /* Check the do_nap flag again after setting napped[] */ @@ -711,10 +683,16 @@ BEGIN_FTR_SECTION mfspr r7, SPRN_DAWRX0 mfspr r8, SPRN_IAMR std r5, STACK_SLOT_CIABR(r1) - std r6, STACK_SLOT_DAWR(r1) - std r7, STACK_SLOT_DAWRX(r1) + std r6, STACK_SLOT_DAWR0(r1) + std r7, STACK_SLOT_DAWRX0(r1) std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +BEGIN_FTR_SECTION + mfspr r6, SPRN_DAWR1 + mfspr r7, SPRN_DAWRX1 + std r6, STACK_SLOT_DAWR1(r1) + std r7, STACK_SLOT_DAWRX1(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1) mfspr r5, SPRN_AMR std r5, STACK_SLOT_AMR(r1) @@ -801,10 +779,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) lbz r5, 0(r5) cmpdi r5, 0 beq 1f - ld r5, VCPU_DAWR(r4) - ld r6, VCPU_DAWRX(r4) + ld r5, VCPU_DAWR0(r4) + ld r6, VCPU_DAWRX0(r4) mtspr SPRN_DAWR0, r5 mtspr SPRN_DAWRX0, r6 +BEGIN_FTR_SECTION + ld r5, VCPU_DAWR1(r4) + ld r6, VCPU_DAWRX1(r4) + mtspr SPRN_DAWR1, r5 + mtspr SPRN_DAWRX1, r6 +END_FTR_SECTION_IFSET(CPU_FTR_DAWR1) 1: ld r7, VCPU_CIABR(r4) ld r8, VCPU_TAR(r4) @@ -918,15 +902,19 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon - /* For hash guest, clear out and reload the SLB */ ld r6, VCPU_KVM(r4) lbz r0, KVM_RADIX(r6) cmpwi r0, 0 bne 9f + + /* For hash guest, clear out and reload the SLB */ +BEGIN_MMU_FTR_SECTION + /* Radix host won't have populated the SLB, so no need to clear */ li r6, 0 slbmte r6, r6 - slbia + PPC_SLBIA(6) ptesync +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ lwz r5,VCPU_SLB_MAX(r4) @@ -1187,6 +1175,20 @@ EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9) mr r4, r3 b fast_guest_entry_c guest_exit_short_path: + /* + * Malicious or buggy radix guests may have inserted SLB entries + * (only 0..3 because radix always runs with UPRT=1), so these must + * be cleared here to avoid side-channels. slbmte is used rather + * than slbia, as it won't clear cached translations. + */ + li r0,0 + slbmte r0,r0 + li r4,1 + slbmte r0,r4 + li r4,2 + slbmte r0,r4 + li r4,3 + slbmte r0,r4 li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) @@ -1499,7 +1501,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ lbz r0, KVM_RADIX(r5) li r5, 0 cmpwi r0, 0 - bne 3f /* for radix, save 0 entries */ + bne 0f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 @@ -1518,13 +1520,13 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Finally clear out the SLB */ li r0,0 slbmte r0,r0 - slbia + PPC_SLBIA(6) ptesync -3: stw r5,VCPU_SLB_MAX(r9) + stw r5,VCPU_SLB_MAX(r9) /* load host SLB entries */ BEGIN_MMU_FTR_SECTION - b 0f + b guest_bypass END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r8,PACA_SLBSHADOWPTR(r13) @@ -1538,7 +1540,21 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) slbmte r6,r5 1: addi r8,r8,16 .endr -0: + b guest_bypass + +0: /* + * Sanitise radix guest SLB, see guest_exit_short_path comment. + * We clear vcpu->arch.slb_max to match earlier behaviour. + */ + li r0,0 + stw r0,VCPU_SLB_MAX(r9) + slbmte r0,r0 + li r4,1 + slbmte r0,r4 + li r4,2 + slbmte r0,r4 + li r4,3 + slbmte r0,r4 guest_bypass: stw r12, STACK_SLOT_TRAP(r1) @@ -1759,8 +1775,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* Restore host values of some registers */ BEGIN_FTR_SECTION ld r5, STACK_SLOT_CIABR(r1) - ld r6, STACK_SLOT_DAWR(r1) - ld r7, STACK_SLOT_DAWRX(r1) + ld r6, STACK_SLOT_DAWR0(r1) + ld r7, STACK_SLOT_DAWRX0(r1) mtspr SPRN_CIABR, r5 /* * If the DAWR doesn't work, it's ok to write these here as @@ -1770,6 +1786,12 @@ BEGIN_FTR_SECTION mtspr SPRN_DAWRX0, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION + ld r6, STACK_SLOT_DAWR1(r1) + ld r7, STACK_SLOT_DAWRX1(r1) + mtspr SPRN_DAWR1, r6 + mtspr SPRN_DAWRX1, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1) +BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) @@ -1938,24 +1960,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 19: lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 -16: -BEGIN_FTR_SECTION - /* On POWER9 with HPT-on-radix we need to wait for all other threads */ - ld r3, HSTATE_SPLIT_MODE(r13) - cmpdi r3, 0 - beq 47f - lwz r8, KVM_SPLIT_DO_RESTORE(r3) - cmpwi r8, 0 - beq 47f - bl kvmhv_p9_restore_lpcr - nop - b 48f -47: -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - ld r8,KVM_HOST_LPCR(r4) +16: ld r8,KVM_HOST_LPCR(r4) mtspr SPRN_LPCR,r8 isync -48: + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING /* Finish timing, if we have a vcpu */ ld r4, HSTATE_KVM_VCPU(r13) @@ -2574,8 +2582,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW rlwimi r5, r4, 2, DAWRX_WT clrrdi r4, r4, 3 - std r4, VCPU_DAWR(r3) - std r5, VCPU_DAWRX(r3) + std r4, VCPU_DAWR0(r3) + std r5, VCPU_DAWRX0(r3) /* * If came in through the real mode hcall handler then it is necessary * to write the registers since the return path won't. Otherwise it is @@ -2779,8 +2787,10 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) beq kvm_end_cede cmpwi r0, NAPPING_NOVCPU beq kvm_novcpu_wakeup +BEGIN_FTR_SECTION cmpwi r0, NAPPING_UNSPLIT beq kvm_unsplit_wakeup +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) twi 31,0,0 /* Nap state must not be zero */ 33: mr r4, r3 @@ -3343,13 +3353,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_IAMR, r0 mtspr SPRN_CIABR, r0 mtspr SPRN_DAWRX0, r0 +BEGIN_FTR_SECTION + mtspr SPRN_DAWRX1, r0 +END_FTR_SECTION_IFSET(CPU_FTR_DAWR1) + + /* Clear hash and radix guest SLB, see guest_exit_short_path comment. */ + slbmte r0, r0 + PPC_SLBIA(6) BEGIN_MMU_FTR_SECTION b 4f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) - slbmte r0, r0 - slbia ptesync ld r8, PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 288a9820ec01..f38ae3e54b37 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -698,7 +698,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); r = 1; - }; + } return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cf52d26f49cd..6c083a9b3545 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -611,8 +611,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) && - cpu_has_feature(CPU_FTR_HVMODE)); + r = !!(hv_enabled && kvmppc_hv_ops->hash_v3_possible && + kvmppc_hv_ops->hash_v3_possible()); break; case KVM_CAP_PPC_NESTED_HV: r = !!(hv_enabled && kvmppc_hv_ops->enable_nested && @@ -678,6 +678,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = hv_enabled && kvmppc_hv_ops->enable_svm && !kvmppc_hv_ops->enable_svm(NULL); break; + case KVM_CAP_PPC_DAWR1: + r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 && + !kvmppc_hv_ops->enable_dawr1(NULL)); + break; #endif default: r = 0; @@ -2187,6 +2191,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; r = kvm->arch.kvm_ops->enable_svm(kvm); break; + case KVM_CAP_PPC_DAWR1: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(kvm) || !kvm->arch.kvm_ops->enable_dawr1) + break; + r = kvm->arch.kvm_ops->enable_dawr1(kvm); + break; #endif default: r = -EINVAL; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 74f9a036bab2..6bcfc5614bbc 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -28,7 +28,6 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 -#define KVM_USER_MEM_SLOTS 32 /* * These seem to be used for allocating ->chip in the routing table, which we diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 18f2d10c3176..474617b88648 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -170,7 +170,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); if (ret) goto out_unlock_mmap; @@ -311,7 +311,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); if (ret) goto out_unlock_mmap; diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index 7fc82a233f49..3a9a0b0c7465 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -11,8 +11,8 @@ #include <asm/processor.h> #include <asm/barrier.h> -#include <asm/qrwlock.h> #include <asm/qspinlock.h> +#include <asm/qrwlock.h> #endif /* !(__ASSEMBLY__) */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1feb6c089ba2..cc96e26d69f7 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -292,6 +292,7 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ @@ -335,6 +336,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h new file mode 100644 index 000000000000..355a2ab8fc09 --- /dev/null +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL) +BUILD_BUG_ON(1) +#endif + +/* + * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate + * "static_call()"s. They are also intended for use when defining + * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those + * functions that follow the [svm|vmx]_func_name convention. + * KVM_X86_OP_NULL() can leave a NULL definition for the + * case where there is no definition or a function name that + * doesn't match the typical naming convention is supplied. + */ +KVM_X86_OP_NULL(hardware_enable) +KVM_X86_OP_NULL(hardware_disable) +KVM_X86_OP_NULL(hardware_unsetup) +KVM_X86_OP_NULL(cpu_has_accelerated_tpr) +KVM_X86_OP(has_emulated_msr) +KVM_X86_OP(vcpu_after_set_cpuid) +KVM_X86_OP(vm_init) +KVM_X86_OP_NULL(vm_destroy) +KVM_X86_OP(vcpu_create) +KVM_X86_OP(vcpu_free) +KVM_X86_OP(vcpu_reset) +KVM_X86_OP(prepare_guest_switch) +KVM_X86_OP(vcpu_load) +KVM_X86_OP(vcpu_put) +KVM_X86_OP(update_exception_bitmap) +KVM_X86_OP(get_msr) +KVM_X86_OP(set_msr) +KVM_X86_OP(get_segment_base) +KVM_X86_OP(get_segment) +KVM_X86_OP(get_cpl) +KVM_X86_OP(set_segment) +KVM_X86_OP_NULL(get_cs_db_l_bits) +KVM_X86_OP(set_cr0) +KVM_X86_OP(is_valid_cr4) +KVM_X86_OP(set_cr4) +KVM_X86_OP(set_efer) +KVM_X86_OP(get_idt) +KVM_X86_OP(set_idt) +KVM_X86_OP(get_gdt) +KVM_X86_OP(set_gdt) +KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr7) +KVM_X86_OP(cache_reg) +KVM_X86_OP(get_rflags) +KVM_X86_OP(set_rflags) +KVM_X86_OP(tlb_flush_all) +KVM_X86_OP(tlb_flush_current) +KVM_X86_OP_NULL(tlb_remote_flush) +KVM_X86_OP_NULL(tlb_remote_flush_with_range) +KVM_X86_OP(tlb_flush_gva) +KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(run) +KVM_X86_OP_NULL(handle_exit) +KVM_X86_OP_NULL(skip_emulated_instruction) +KVM_X86_OP_NULL(update_emulated_instruction) +KVM_X86_OP(set_interrupt_shadow) +KVM_X86_OP(get_interrupt_shadow) +KVM_X86_OP(patch_hypercall) +KVM_X86_OP(set_irq) +KVM_X86_OP(set_nmi) +KVM_X86_OP(queue_exception) +KVM_X86_OP(cancel_injection) +KVM_X86_OP(interrupt_allowed) +KVM_X86_OP(nmi_allowed) +KVM_X86_OP(get_nmi_mask) +KVM_X86_OP(set_nmi_mask) +KVM_X86_OP(enable_nmi_window) +KVM_X86_OP(enable_irq_window) +KVM_X86_OP(update_cr8_intercept) +KVM_X86_OP(check_apicv_inhibit_reasons) +KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl) +KVM_X86_OP(refresh_apicv_exec_ctrl) +KVM_X86_OP(hwapic_irr_update) +KVM_X86_OP(hwapic_isr_update) +KVM_X86_OP_NULL(guest_apic_has_interrupt) +KVM_X86_OP(load_eoi_exitmap) +KVM_X86_OP(set_virtual_apic_mode) +KVM_X86_OP_NULL(set_apic_access_page_addr) +KVM_X86_OP(deliver_posted_interrupt) +KVM_X86_OP_NULL(sync_pir_to_irr) +KVM_X86_OP(set_tss_addr) +KVM_X86_OP(set_identity_map_addr) +KVM_X86_OP(get_mt_mask) +KVM_X86_OP(load_mmu_pgd) +KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(write_l1_tsc_offset) +KVM_X86_OP(get_exit_info) +KVM_X86_OP(check_intercept) +KVM_X86_OP(handle_exit_irqoff) +KVM_X86_OP_NULL(request_immediate_exit) +KVM_X86_OP(sched_in) +KVM_X86_OP_NULL(slot_enable_log_dirty) +KVM_X86_OP_NULL(slot_disable_log_dirty) +KVM_X86_OP_NULL(flush_log_dirty) +KVM_X86_OP_NULL(enable_log_dirty_pt_masked) +KVM_X86_OP_NULL(cpu_dirty_log_size) +KVM_X86_OP_NULL(pre_block) +KVM_X86_OP_NULL(post_block) +KVM_X86_OP_NULL(vcpu_blocking) +KVM_X86_OP_NULL(vcpu_unblocking) +KVM_X86_OP_NULL(update_pi_irte) +KVM_X86_OP_NULL(apicv_post_state_restore) +KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) +KVM_X86_OP_NULL(set_hv_timer) +KVM_X86_OP_NULL(cancel_hv_timer) +KVM_X86_OP(setup_mce) +KVM_X86_OP(smi_allowed) +KVM_X86_OP(pre_enter_smm) +KVM_X86_OP(pre_leave_smm) +KVM_X86_OP(enable_smi_window) +KVM_X86_OP_NULL(mem_enc_op) +KVM_X86_OP_NULL(mem_enc_reg_region) +KVM_X86_OP_NULL(mem_enc_unreg_region) +KVM_X86_OP(get_msr_feature) +KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(apic_init_signal_blocked) +KVM_X86_OP_NULL(enable_direct_tlbflush) +KVM_X86_OP_NULL(migrate_timers) +KVM_X86_OP(msr_filter_changed) +KVM_X86_OP_NULL(complete_emulated_msr) + +#undef KVM_X86_OP +#undef KVM_X86_OP_NULL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3d6616f6f6ef..84499aad01a4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,10 +40,8 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 -#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) #define KVM_HALT_POLL_NS_DEFAULT 200000 @@ -52,6 +50,9 @@ #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) +#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ + KVM_BUS_LOCK_DETECTION_EXIT) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -200,9 +201,17 @@ enum x86_intercept_stage; #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) -#define DR6_FIXED_1 0xfffe0ff0 -#define DR6_INIT 0xffff0ff0 +/* + * DR6_ACTIVE_LOW combines fixed-1 and active-low bits. + * We can regard all the bits in DR6_FIXED_1 as active_low bits; + * they will never be 0 for now, but when they are defined + * in the future it will require no code change. + * + * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. + */ +#define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_VOLATILE 0x0001e00f +#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) @@ -337,6 +346,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_HAVE_MMU_RWLOCK + struct kvm_mmu_page; /* @@ -358,8 +369,6 @@ struct kvm_mmu { int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); - void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte); hpa_t root_hpa; gpa_t root_pgd; union kvm_mmu_role mmu_role; @@ -510,6 +519,7 @@ struct kvm_vcpu_hv_synic { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { + struct kvm_vcpu *vcpu; u32 vp_index; u64 hv_vapic; s64 runtime_offset; @@ -520,6 +530,15 @@ struct kvm_vcpu_hv { cpumask_t tlb_flush; }; +/* Xen HVM per vcpu emulation context */ +struct kvm_vcpu_xen { + u64 hypercall_rip; + bool vcpu_info_set; + bool vcpu_time_info_set; + struct gfn_to_hva_cache vcpu_info_cache; + struct gfn_to_hva_cache vcpu_time_info_cache; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -640,7 +659,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - unsigned long cr3_lm_rsvd_bits; + u64 reserved_gpa_bits; int maxphyaddr; int max_tdp_level; @@ -717,7 +736,9 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - struct kvm_vcpu_hv hyperv; + bool hyperv_enabled; + struct kvm_vcpu_hv *hyperv; + struct kvm_vcpu_xen xen; cpumask_var_t wbinvd_dirty_mask; @@ -888,6 +909,14 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +/* Xen emulation context */ +struct kvm_xen { + bool long_mode; + bool shinfo_set; + u8 upcall_vector; + struct gfn_to_hva_cache shinfo_cache; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -967,6 +996,7 @@ struct kvm_arch { struct hlist_head mask_notifier_list; struct kvm_hv hyperv; + struct kvm_xen xen; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; @@ -998,9 +1028,12 @@ struct kvm_arch { struct msr_bitmap_range ranges[16]; } msr_filter; + bool bus_lock_detection_enabled; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; +#ifdef CONFIG_X86_64 /* * Whether the TDP MMU is enabled for this VM. This contains a * snapshot of the TDP MMU module parameter from when the VM was @@ -1026,12 +1059,25 @@ struct kvm_arch { * tdp_mmu_page set and a root_count of 0. */ struct list_head tdp_mmu_pages; + + /* + * Protects accesses to the following fields when the MMU lock + * is held in read mode: + * - tdp_mmu_pages (above) + * - the link field of struct kvm_mmu_pages used by the TDP MMU + * - lpage_disallowed_mmu_pages + * - the lpage_disallowed_link field of struct kvm_mmu_pages used + * by the TDP MMU + * It is acceptable, but not necessary, to acquire this lock when + * the thread holds the MMU lock in write mode. + */ + spinlock_t tdp_mmu_pages_lock; +#endif /* CONFIG_X86_64 */ }; struct kvm_vm_stat { ulong mmu_shadow_zapped; ulong mmu_pte_write; - ulong mmu_pte_updated; ulong mmu_pde_zapped; ulong mmu_flooded; ulong mmu_recycled; @@ -1340,6 +1386,19 @@ extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; +#define KVM_X86_OP(func) \ + DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); +#define KVM_X86_OP_NULL KVM_X86_OP +#include <asm/kvm-x86-ops.h> + +static inline void kvm_ops_static_call_update(void) +{ +#define KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP_NULL KVM_X86_OP +#include <asm/kvm-x86-ops.h> +} + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -1351,7 +1410,7 @@ void kvm_arch_free_vm(struct kvm *kvm); static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) { if (kvm_x86_ops.tlb_remote_flush && - !kvm_x86_ops.tlb_remote_flush(kvm)) + !static_call(kvm_x86_tlb_remote_flush)(kvm)) return 0; else return -ENOTSUPP; @@ -1421,6 +1480,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits; extern u64 kvm_max_tsc_scaling_ratio; /* 1ull << kvm_tsc_scaling_ratio_frac_bits */ extern u64 kvm_default_tsc_scaling_ratio; +/* bus lock detection supported? */ +extern bool kvm_has_bus_lock_exit; extern u64 kvm_mce_cap_supported; @@ -1501,7 +1562,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); @@ -1742,14 +1803,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.vcpu_blocking) - kvm_x86_ops.vcpu_blocking(vcpu); + static_call_cond(kvm_x86_vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.vcpu_unblocking) - kvm_x86_ops.vcpu_unblocking(vcpu); + static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); } static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 9aad0e0876fb..8757078d4442 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -30,16 +30,29 @@ static inline int cpu_has_vmx(void) } -/** Disable VMX on the current CPU +/** + * cpu_vmxoff() - Disable VMX on the current CPU * - * vmxoff causes a undefined-opcode exception if vmxon was not run - * on the CPU previously. Only call this function if you know VMX - * is enabled. + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) + * + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to + * atomically track post-VMXON state, e.g. this may be called in NMI context. + * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is + * magically in RM, VM86, compat mode, or at CPL>0. */ -static inline void cpu_vmxoff(void) +static inline int cpu_vmxoff(void) { - asm volatile ("vmxoff"); + asm_volatile_goto("1: vmxoff\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "cc", "memory" : fault); + + cr4_clear_bits(X86_CR4_VMXE); + return 0; + +fault: cr4_clear_bits(X86_CR4_VMXE); + return -EIO; } static inline int cpu_vmx_enabled(void) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 38ca445a8429..358707f60d99 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -73,6 +73,7 @@ #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) +#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 9915990fd8cf..d9a74681a77d 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -83,5 +83,6 @@ #define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ +#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 9139b3e86316..baca0b00ef76 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -182,6 +182,9 @@ struct arch_shared_info { unsigned long p2m_cr3; /* cr3 value of the p2m address space */ unsigned long p2m_vaddr; /* virtual address of the p2m list */ unsigned long p2m_generation; /* generation count of p2m mapping */ +#ifdef CONFIG_X86_32 + uint32_t wc_sec_hi; +#endif }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 8e76d3701db3..5a3022c8af82 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -112,6 +112,7 @@ struct kvm_ioapic_state { #define KVM_NR_IRQCHIPS 3 #define KVM_RUN_X86_SMM (1 << 0) +#define KVM_RUN_X86_BUS_LOCK (1 << 1) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index ada955c5ebb6..b8e650a985e3 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -89,6 +89,7 @@ #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 +#define EXIT_REASON_BUS_LOCK 74 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -150,7 +151,8 @@ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ - { EXIT_REASON_TPAUSE, "TPAUSE" } + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7f4c081f59f0..819db00c9388 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1747,6 +1747,7 @@ void apic_ap_setup(void) #ifdef CONFIG_X86_X2APIC int x2apic_mode; +EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9991c5920aac..b29657b76e3f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -547,31 +547,21 @@ static void emergency_vmx_disable_all(void) local_irq_disable(); /* - * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignores INIT - * signals when VMX is enabled. + * Disable VMX on all CPUs before rebooting, otherwise we risk hanging + * the machine, because the CPU blocks INIT when it's in VMX root. * - * We can't take any locks and we may be on an inconsistent - * state, so we use NMIs as IPIs to tell the other CPUs to disable - * VMX and halt. + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. * - * For safety, we will avoid running the nmi_shootdown_cpus() - * stuff unnecessarily, but we don't have a way to check - * if other CPUs have VMX enabled. So we will call it only if the - * CPU we are running on has VMX enabled. - * - * We will miss cases where VMX is not enabled on all CPUs. This - * shouldn't do much harm because KVM always enable VMX on all - * CPUs anyway. But we can miss it on the small window where KVM - * is still enabling VMX. + * Do the NMI shootdown even if VMX if off on _this_ CPU, as that + * doesn't prevent a different CPU from being in VMX root operation. */ - if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. */ - cpu_vmxoff(); + if (cpu_has_vmx()) { + /* Safely force _this_ CPU out of VMX root operation. */ + __cpu_emergency_vmxoff(); - /* Halt and disable VMX on the other CPUs */ + /* Halt and exit VMX root operation on the other CPUs. */ nmi_shootdown_cpus(vmxoff_nmi); - } } diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4bd14ab01323..aeab168c5711 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,10 +14,11 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/dirty_ring.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o -kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o emulate.o i8259.o irq.o lapic.o xen.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o + mmu/spte.o +kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 38172ca627d3..c8f2592ccc99 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -173,16 +173,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_mmu_reset_context(vcpu); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + kvm_hv_set_cpuid(vcpu); /* Invoke the vendor callback only after the above state is updated. */ - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); + + /* + * Except for the MMU, which needs to be reset after any vendor + * specific adjustments to the reserved GPA bits. + */ + kvm_mmu_reset_context(vcpu); } static int is_efer_nx(void) @@ -223,6 +229,16 @@ not_found: return 36; } +/* + * This "raw" version returns the reserved GPA bits without any adjustments for + * encryption technologies that usurp bits. The raw mask should be used if and + * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs. + */ +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -434,7 +450,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX512_BF16) + F(AVX_VNNI) | F(AVX512_BF16) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dc921d76e42e..2a0c5064497f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -30,15 +30,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return !(gpa & vcpu->arch.reserved_gpa_bits); +} + static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); + return !kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t alignment) +{ + return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa); +} + +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } struct cpuid_reg { @@ -324,11 +341,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } -static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature) { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 66a08322988f..f7970ba6219f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2506,12 +2506,12 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, val = GET_SMSTATE(u32, smstate, 0x7fcc); - if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 6, val)) return X86EMUL_UNHANDLEABLE; val = GET_SMSTATE(u32, smstate, 0x7fc8); - if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 7, val)) return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7fc4); @@ -2564,14 +2564,14 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smstate, 0x7f68); + val = GET_SMSTATE(u64, smstate, 0x7f68); - if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 6, val)) return X86EMUL_UNHANDLEABLE; - val = GET_SMSTATE(u32, smstate, 0x7f60); + val = GET_SMSTATE(u64, smstate, 0x7f60); - if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + if (ctxt->ops->set_dr(ctxt, 7, val)) return X86EMUL_UNHANDLEABLE; cr0 = GET_SMSTATE(u64, smstate, 0x7f58); @@ -4329,7 +4329,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_dr(ctxt, 6, &dr6); dr6 &= ~DR_TRAP_BITS; - dr6 |= DR6_BD | DR6_RTM; + dr6 |= DR6_BD | DR6_ACTIVE_LOW; ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 922c69dcca4d..7d2dae92d638 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,6 +23,7 @@ #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" +#include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> @@ -36,6 +37,9 @@ #include "trace.h" #include "irq.h" +/* "Hv#1" signature */ +#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 + #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, @@ -128,7 +132,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ - kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); + kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } @@ -141,10 +145,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); - if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } @@ -157,15 +161,15 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu) return NULL; - synic = vcpu_to_synic(vcpu); + synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; @@ -189,8 +193,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; @@ -204,7 +208,7 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; if (!synic->active && !host) @@ -282,8 +286,7 @@ static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) hv->hv_syndbg.control.status = @@ -293,8 +296,8 @@ static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; hv_vcpu->exit.u.syndbg.msr = msr; @@ -310,13 +313,13 @@ static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, - vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + to_hv_vcpu(vcpu)->vp_index, msr, data); switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: syndbg->control.control = data; @@ -349,7 +352,7 @@ static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; @@ -377,9 +380,7 @@ static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) break; } - trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, - vcpu_to_hv_vcpu(vcpu)->vp_index, msr, - *pdata); + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); return 0; } @@ -421,7 +422,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; @@ -457,7 +458,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); @@ -514,7 +515,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; @@ -534,10 +535,10 @@ static u64 get_time_ref_counter(struct kvm *kvm) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); @@ -545,14 +546,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); - trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, - vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } @@ -562,7 +563,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); - trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); @@ -579,7 +580,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) u64 time_now; ktime_t ktime_now; - time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); + time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { @@ -596,7 +597,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( - stimer_to_vcpu(stimer)->vcpu_id, + hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); @@ -618,7 +619,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) return 0; } - trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); @@ -633,13 +634,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && !host) return 1; - trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); @@ -657,13 +658,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && !host) return 1; - trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); @@ -694,7 +695,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; @@ -750,7 +751,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; @@ -763,14 +764,14 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); - return synic_deliver_msg(vcpu_to_synic(vcpu), + return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector @@ -790,7 +791,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); - trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, + trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; @@ -801,11 +802,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; + if (!hv_vcpu) + return; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; @@ -831,16 +835,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; + if (!hv_vcpu) + return; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); + + kfree(hv_vcpu); + vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } @@ -880,28 +895,41 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) stimer_prepare_msg(stimer); } -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; int i; + hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); + if (!hv_vcpu) + return -ENOMEM; + + vcpu->arch.hyperv = hv_vcpu; + hv_vcpu->vcpu = vcpu; + synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); -} - -void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) -{ - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); + + return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { - struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_synic *synic; + int r; + + if (!to_hv_vcpu(vcpu)) { + r = kvm_hv_vcpu_init(vcpu); + if (r) + return r; + } + + synic = to_hv_synic(vcpu); /* * Hyper-V SynIC auto EOI SINT's are @@ -939,10 +967,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr) return r; } -static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 *pdata) +static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) @@ -952,41 +979,26 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, return 0; } -static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } -static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; - - if (host) - hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; + struct kvm_hv *hv = to_kvm_hv(kvm); - if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) { - - vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", - hv->hv_crash_param[0], - hv->hv_crash_param[1], - hv->hv_crash_param[2], - hv->hv_crash_param[3], - hv->hv_crash_param[4]); - - /* Send notification about crash to user space */ - kvm_make_request(KVM_REQ_HV_CRASH, vcpu); - } + hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } -static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, - u32 index, u64 data) +static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) @@ -1068,7 +1080,7 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; @@ -1078,7 +1090,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; - mutex_lock(&kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) goto out_unlock; @@ -1122,14 +1134,14 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); out_unlock: - mutex_unlock(&kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); switch (msr) { case HV_X64_MSR_GUEST_OS_ID: @@ -1139,9 +1151,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; break; case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; + u8 instructions[9]; + int i = 0; + u64 addr; /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) @@ -1150,16 +1162,33 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, hv->hv_hypercall = data; break; } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops.patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) + + /* + * If Xen and Hyper-V hypercalls are both enabled, disambiguate + * the same way Xen itself does, by setting the bit 31 of EAX + * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just + * going to be clobbered on 64-bit. + */ + if (kvm_xen_hypercall_enabled(kvm)) { + /* orl $0x80000000, %eax */ + instructions[i++] = 0x0d; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x80; + } + + /* vmcall/vmmcall */ + static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + i += 3; + + /* ret */ + ((unsigned char *)instructions)[i++] = 0xc3; + + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) return 1; hv->hv_hypercall = data; - mark_page_dirty(kvm, gfn); break; } case HV_X64_MSR_REFERENCE_TSC: @@ -1168,11 +1197,25 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_set_crash_data(vcpu, + return kvm_hv_msr_set_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, data); case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + if (host) + return kvm_hv_msr_set_crash_ctl(kvm, data); + + if (data & HV_CRASH_CTL_CRASH_NOTIFY) { + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + break; case HV_X64_MSR_RESET: if (data == 1) { vcpu_debug(vcpu, "hyper-v reset requested\n"); @@ -1216,11 +1259,11 @@ static u64 current_task_runtime_100ns(void) static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); switch (msr) { case HV_X64_MSR_VP_INDEX: { - struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); int vcpu_idx = kvm_vcpu_get_idx(vcpu); u32 new_vp_index = (u32)data; @@ -1291,14 +1334,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); + return synic_set_msr(to_hv_synic(vcpu), msr, data, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), + return stimer_set_config(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_STIMER0_COUNT: @@ -1307,7 +1350,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), + return stimer_set_count(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_TSC_FREQUENCY: @@ -1330,7 +1373,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, { u64 data = 0; struct kvm *kvm = vcpu->kvm; - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); switch (msr) { case HV_X64_MSR_GUEST_OS_ID: @@ -1346,11 +1389,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = hv->hv_tsc_page; break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - return kvm_hv_msr_get_crash_data(vcpu, + return kvm_hv_msr_get_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, pdata); case HV_X64_MSR_CRASH_CTL: - return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + return kvm_hv_msr_get_crash_ctl(kvm, pdata); case HV_X64_MSR_RESET: data = 0; break; @@ -1379,7 +1422,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; - struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); switch (msr) { case HV_X64_MSR_VP_INDEX: @@ -1403,14 +1446,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); + return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; - return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), + return stimer_get_config(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_STIMER0_COUNT: @@ -1419,7 +1462,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; - return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), + return stimer_get_count(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_TSC_FREQUENCY: @@ -1438,12 +1481,22 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!host && !vcpu->arch.hyperv_enabled) + return 1; + + if (!to_hv_vcpu(vcpu)) { + if (kvm_hv_vcpu_init(vcpu)) + return 1; + } + if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); @@ -1451,12 +1504,22 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + + if (!host && !vcpu->arch.hyperv_enabled) + return 1; + + if (!to_hv_vcpu(vcpu)) { + if (kvm_hv_vcpu_init(vcpu)) + return 1; + } + if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_lock(&hv->hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); - mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); + mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata, host); @@ -1466,7 +1529,7 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, u64 *vp_bitmap, unsigned long *vcpu_bitmap) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; int i, bank, sbank = 0; @@ -1483,18 +1546,16 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { - if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, - (unsigned long *)vp_bitmap)) + if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) __set_bit(i, vcpu_bitmap); } return vcpu_bitmap; } -static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, - u16 rep_cnt, bool ex) +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex) { - struct kvm *kvm = current_vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = ¤t_vcpu->arch.hyperv; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1592,10 +1653,10 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, } } -static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, +static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, bool ex, bool fast) { - struct kvm *kvm = current_vcpu->kvm; + struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1666,9 +1727,20 @@ ret_success: return HV_STATUS_SUCCESS; } -bool kvm_hv_hypercall_enabled(struct kvm *kvm) +void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { - return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0; + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) + vcpu->arch.hyperv_enabled = true; + else + vcpu->arch.hyperv_enabled = false; +} + +bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) @@ -1698,6 +1770,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; if (unlikely(!fast)) { @@ -1726,7 +1799,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); - eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param); + eventfd = idr_find(&hv->conn_to_evt, param); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; @@ -1745,7 +1818,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -1793,7 +1866,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) { + if (unlikely(rep || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } @@ -1855,7 +1928,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } fallthrough; case HVCALL_RESET_DEBUG_SESSION: { - struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu)) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; @@ -1885,23 +1958,26 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) void kvm_hv_init_vm(struct kvm *kvm) { - mutex_init(&kvm->arch.hyperv.hv_lock); - idr_init(&kvm->arch.hyperv.conn_to_evt); + struct kvm_hv *hv = to_kvm_hv(kvm); + + mutex_init(&hv->hv_lock); + idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; - idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i) + idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); - idr_destroy(&kvm->arch.hyperv.conn_to_evt); + idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; @@ -1925,7 +2001,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { - struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); @@ -1997,8 +2073,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; case HYPERV_CPUID_INTERFACE: - memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12); - ent->eax = signature[0]; + ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 6d7def2b0aad..e951af1fcb2c 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -50,38 +50,46 @@ /* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */ #define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2) -static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) +static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm) { - return &vcpu->arch.hyperv; + return &kvm->arch.hyperv; } -static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu) +static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_arch *arch; - - arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv); - return container_of(arch, struct kvm_vcpu, arch); + return vcpu->arch.hyperv; } -static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu) +static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu) { - return &vcpu->arch.hyperv.synic; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return &hv_vcpu->synic; } -static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) +static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) { - return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); + struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic); + + return hv_vcpu->vcpu; } -static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu) { return &vcpu->kvm->arch.hyperv.hv_syndbg; } +static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu); +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); -bool kvm_hv_hypercall_enabled(struct kvm *kvm); +bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); @@ -89,32 +97,35 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); -void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, struct hv_vp_assist_page *assist_page); -static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, - int timer_index) +static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu, + int timer_index) { - return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index]; + return &to_hv_vcpu(vcpu)->stimer[timer_index]; } -static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) +static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu_hv *hv_vcpu; hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv, stimer[0]); - return hv_vcpu_to_vcpu(hv_vcpu); + return hv_vcpu->vcpu; } static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) { - return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap, + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return false; + + return !bitmap_empty(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); } @@ -125,6 +136,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); +void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 814698e5b152..172b05343cfd 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -14,6 +14,7 @@ #include "irq.h" #include "i8254.h" #include "x86.h" +#include "xen.h" /* * check if there are pending timer events @@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.injected; + if (kvm_xen_has_interrupt(v)) + return 1; + if (!kvm_apic_accept_pic_intr(v)) return 0; @@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; + if (kvm_xen_has_interrupt(v)) + return v->kvm->arch.xen.upcall_vector; + if (irqchip_split(v->kvm)) { int vector = v->arch.pending_external_vector; @@ -143,8 +150,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); - if (kvm_x86_ops.migrate_timers) - kvm_x86_ops.migrate_timers(vcpu); + static_call_cond(kvm_x86_migrate_timers)(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index a889563ad02d..2e11da2f5621 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -68,7 +68,7 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) return 0; if (!kvm_register_is_available(vcpu, reg)) - kvm_x86_ops.cache_reg(vcpu, reg); + static_call(kvm_x86_cache_reg)(vcpu, reg); return vcpu->arch.regs[reg]; } @@ -108,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -118,7 +118,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if ((tmask & vcpu->arch.cr0_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } @@ -132,14 +132,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if ((tmask & vcpu->arch.cr4_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3); + static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 43c93ffa76ed..0d359115429a 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -205,7 +205,7 @@ struct x86_emulate_ops { ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); - int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43cceadd073e..45d40bfacb7c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -91,8 +91,8 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } -struct static_key_deferred apic_hw_disabled __read_mostly; -struct static_key_deferred apic_sw_disabled __read_mostly; +__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); +__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); static inline int apic_enabled(struct kvm_lapic *apic) { @@ -290,9 +290,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; if (enabled) - static_key_slow_dec_deferred(&apic_sw_disabled); + static_branch_slow_dec_deferred(&apic_sw_disabled); else - static_key_slow_inc(&apic_sw_disabled.key); + static_branch_inc(&apic_sw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } @@ -484,7 +484,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_x86_ops.hwapic_irr_update(vcpu, + static_call(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; @@ -515,7 +515,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops.hwapic_isr_update(vcpu, vec); + static_call(kvm_x86_hwapic_isr_update)(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -563,8 +563,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops.hwapic_isr_update(vcpu, - apic_find_highest_isr(apic)); + static_call(kvm_x86_hwapic_isr_update)(vcpu, + apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -701,7 +701,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (apic->vcpu->arch.apicv_active) - highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu); + highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) @@ -1090,7 +1090,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) { + if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) { kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -1245,7 +1245,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap)) + if (to_hv_vcpu(apic->vcpu) && + test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap)) kvm_hv_synic_send_eoi(apic->vcpu, vector); kvm_ioapic_send_eoi(apic, vector); @@ -1814,7 +1815,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - kvm_x86_ops.cancel_hv_timer(apic->vcpu); + static_call(kvm_x86_cancel_hv_timer)(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } @@ -1831,7 +1832,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!ktimer->tscdeadline) return false; - if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired)) + if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; @@ -2175,10 +2176,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) - static_key_slow_dec_deferred(&apic_hw_disabled); + static_branch_slow_dec_deferred(&apic_hw_disabled); if (!apic->sw_enabled) - static_key_slow_dec_deferred(&apic_sw_disabled); + static_branch_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) free_page((unsigned long)apic->regs); @@ -2250,9 +2251,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - static_key_slow_dec_deferred(&apic_hw_disabled); + static_branch_slow_dec_deferred(&apic_hw_disabled); } else { - static_key_slow_inc(&apic_hw_disabled.key); + static_branch_inc(&apic_hw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } } @@ -2261,7 +2262,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) - kvm_x86_ops.set_virtual_apic_mode(vcpu); + static_call(kvm_x86_set_virtual_apic_mode)(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2338,9 +2339,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { - kvm_x86_ops.apicv_post_state_restore(vcpu); - kvm_x86_ops.hwapic_irr_update(vcpu, -1); - kvm_x86_ops.hwapic_isr_update(vcpu, -1); + static_call(kvm_x86_apicv_post_state_restore)(vcpu); + static_call(kvm_x86_hwapic_irr_update)(vcpu, -1); + static_call(kvm_x86_hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2449,7 +2450,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) * thinking that APIC state has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ + static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -2512,7 +2513,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) */ apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { + if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another @@ -2601,10 +2602,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - kvm_x86_ops.apicv_post_state_restore(vcpu); - kvm_x86_ops.hwapic_irr_update(vcpu, + static_call(kvm_x86_apicv_post_state_restore)(vcpu); + static_call(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops.hwapic_isr_update(vcpu, + static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -2904,13 +2905,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) } } -void kvm_lapic_init(void) -{ - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&apic_hw_disabled, HZ); - jump_label_rate_limit(&apic_sw_disabled, HZ); -} - void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4fb86e3a9dd3..997c45a5963a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -6,6 +6,8 @@ #include <linux/kvm_host.h> +#include "hyperv.h" + #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 @@ -125,13 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; -} - int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); -void kvm_lapic_init(void); void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) @@ -172,29 +168,29 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va __kvm_lapic_set_reg(apic->regs, reg_off, val); } -extern struct static_key kvm_no_apic_vcpu; +DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { - if (static_key_false(&kvm_no_apic_vcpu)) + if (static_branch_unlikely(&kvm_has_noapic_vcpu)) return vcpu->arch.apic; return true; } -extern struct static_key_deferred apic_hw_disabled; +extern struct static_key_false_deferred apic_hw_disabled; static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_hw_disabled.key)) + if (static_branch_unlikely(&apic_hw_disabled.key)) return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; return MSR_IA32_APICBASE_ENABLE; } -extern struct static_key_deferred apic_sw_disabled; +extern struct static_key_false_deferred apic_sw_disabled; static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) { - if (static_key_false(&apic_sw_disabled.key)) + if (static_branch_unlikely(&apic_sw_disabled.key)) return apic->sw_enabled; return true; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 261be1d2032b..c68bfc3e2402 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -102,7 +102,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root_hpa)) return; - kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu), + static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu), vcpu->arch.mmu->shadow_root_level); } @@ -152,7 +152,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * * TODO: introduce APIs to split these two cases. */ -static inline int is_writable_pte(unsigned long pte) +static inline bool is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -174,8 +174,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pte_access, unsigned pte_pkey, unsigned pfec) { - int cpl = kvm_x86_ops.get_cpl(vcpu); - unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + int cpl = static_call(kvm_x86_get_cpl)(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); /* * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..e507568cd55d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -190,7 +190,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, int ret = -ENOTSUPP; if (range && kvm_x86_ops.tlb_remote_flush_with_range) - ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range); + ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); if (ret) kvm_flush_remote_tlbs(kvm); @@ -844,17 +844,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, int i, count = 0; if (!rmap_head->val) { - rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); + rmap_printk("%p %llx 0->1\n", spte, *spte); rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { - rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); + rmap_printk("%p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; rmap_head->val = (unsigned long)desc | 1; ++count; } else { - rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); + rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc->sptes[PTE_LIST_EXT-1]) { count += PTE_LIST_EXT; @@ -906,14 +906,14 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) pr_err("%s: %p 0->BUG\n", __func__, spte); BUG(); } else if (!(rmap_head->val & 1)) { - rmap_printk("%s: %p 1->0\n", __func__, spte); + rmap_printk("%p 1->0\n", spte); if ((u64 *)rmap_head->val != spte) { pr_err("%s: %p 1->BUG\n", __func__, spte); BUG(); } rmap_head->val = 0; } else { - rmap_printk("%s: %p many->many\n", __func__, spte); + rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { @@ -1115,7 +1115,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + rmap_printk("spte %p %llx\n", sptep, *sptep); if (pt_protect) spte &= ~SPTE_MMU_WRITEABLE; @@ -1142,7 +1142,7 @@ static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + rmap_printk("spte %p %llx\n", sptep, *sptep); MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; @@ -1184,7 +1184,7 @@ static bool spte_set_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + rmap_printk("spte %p %llx\n", sptep, *sptep); /* * Similar to the !kvm_x86_ops.slot_disable_log_dirty case, @@ -1225,7 +1225,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); while (mask) { @@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); while (mask) { @@ -1283,8 +1283,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, gfn_t gfn_offset, unsigned long mask) { if (kvm_x86_ops.enable_log_dirty_pt_masked) - kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset, - mask); + static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot, + gfn_offset, + mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } @@ -1292,7 +1293,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, int kvm_cpu_dirty_log_size(void) { if (kvm_x86_ops.cpu_dirty_log_size) - return kvm_x86_ops.cpu_dirty_log_size(); + return static_call(kvm_x86_cpu_dirty_log_size)(); return 0; } @@ -1309,7 +1310,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); @@ -1331,7 +1332,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); + rmap_printk("spte %p %llx.\n", sptep, *sptep); pte_list_remove(rmap_head, sptep); flush = true; @@ -1363,7 +1364,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", + rmap_printk("spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); need_flush = 1; @@ -1456,16 +1457,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +static __always_inline int +kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, + gfn_t gfn, + int level, + unsigned long data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1521,7 +1523,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); return r; @@ -1533,7 +1535,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); return r; @@ -1588,7 +1590,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) int young = false; young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); return young; @@ -1599,7 +1601,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) int young = false; young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_hva(kvm, hva); return young; @@ -1723,13 +1725,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_update_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - const void *pte) -{ - WARN_ON(1); -} - #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -2016,9 +2011,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } @@ -2417,7 +2412,7 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, return 0; restart: - list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { /* * Don't zap active root pages, the page itself can't be freed * and zapping it will just force vCPUs to realloc and reload. @@ -2470,7 +2465,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - @@ -2481,7 +2476,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -2492,7 +2487,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); @@ -2500,7 +2495,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return r; } @@ -3161,7 +3156,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); if (kvm_mmu_put_root(kvm, sp)) { - if (sp->tdp_mmu_page) + if (is_tdp_mmu_page(sp)) kvm_tdp_mmu_free_root(kvm, sp); else if (sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3192,7 +3187,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3215,7 +3210,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3236,16 +3231,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu)) { - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return INVALID_PAGE; } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } @@ -3255,7 +3250,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (vcpu->kvm->arch.tdp_mmu_enabled) { + if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); if (!VALID_PAGE(root)) @@ -3416,17 +3411,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) !smp_load_acquire(&sp->unsync_children)) return; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); mmu_sync_children(vcpu, sp); kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { @@ -3440,7 +3435,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); @@ -3724,7 +3719,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return r; r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + read_lock(&vcpu->kvm->mmu_lock); + else + write_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; r = make_mmu_pages_available(vcpu); @@ -3739,7 +3739,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + read_unlock(&vcpu->kvm->mmu_lock); + else + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -3813,7 +3816,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = true; @@ -3984,20 +3986,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, static void __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, int level, bool nx, bool gbpages, + u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { - u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; + u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; - if (!nx) - exb_bit_rsvd = rsvd_bits(63, 63); if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); + if (level == PT32E_ROOT_LEVEL) + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); + else + high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + + /* Note, NX doesn't exist in PDPTEs, this is handled below. */ + if (!nx) + high_bits_rsvd |= rsvd_bits(63, 63); + /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. @@ -4026,45 +4035,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 63) | - rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PDE */ - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PTE */ - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | + high_bits_rsvd | + rsvd_bits(5, 8) | + rsvd_bits(1, 2); /* PDPTE */ + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: - rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: - rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | + nonleaf_bit8_rsvd | + rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | + gbpages_bit_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 29); - rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 20); /* large page */ + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | + gbpages_bit_rsvd | + rsvd_bits(13, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; @@ -4075,8 +4078,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), context->root_level, - context->nx, + vcpu->arch.reserved_gpa_bits, + context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), guest_cpuid_is_amd_or_hygon(vcpu)); @@ -4084,27 +4087,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - int maxphyaddr, bool execonly) + u64 pa_bits_rsvd, bool execonly) { + u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 bad_mt_xwr; - rsvd_check->rsvd_bits_mask[0][4] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][3] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4123,7 +4121,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - cpuid_maxphyaddr(vcpu), execonly); + vcpu->arch.reserved_gpa_bits, execonly); +} + +static inline u64 reserved_hpa_bits(void) +{ + return rsvd_bits(shadow_phys_bits, 63); } /* @@ -4145,7 +4148,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + reserved_hpa_bits(), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4182,14 +4185,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - shadow_phys_bits, + reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - shadow_phys_bits, - false); + reserved_hpa_bits(), false); if (!shadow_me_mask) return; @@ -4209,7 +4211,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - shadow_phys_bits, execonly); + reserved_hpa_bits(), execonly); } #define BYTE_MASK(access) \ @@ -4395,7 +4397,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; - context->update_pte = paging64_update_pte; context->shadow_root_level = level; context->direct_map = false; } @@ -4424,7 +4425,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; - context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = false; } @@ -4506,7 +4506,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; @@ -4678,7 +4677,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; - context->update_pte = ept_update_pte; context->root_level = level; context->direct_map = false; context->mmu_role.as_u64 = new_role.as_u64; @@ -4811,7 +4809,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_pgd(vcpu); - kvm_x86_ops.tlb_flush_current(vcpu); + static_call(kvm_x86_tlb_flush_current)(vcpu); out: return r; } @@ -4826,19 +4824,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_unload); -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - const void *new) -{ - if (sp->role.level != PG_LEVEL_4K) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } - - ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu->update_pte(vcpu, sp, spte, new); -} - static bool need_remote_flush(u64 old, u64 new) { if (!is_shadow_present_pte(old)) @@ -4954,22 +4939,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -/* - * Ignore various flags when determining if a SPTE can be immediately - * overwritten for the current MMU. - * - level: explicitly checked in mmu_pte_write_new_pte(), and will never - * match the current MMU role, as MMU's level tracks the root level. - * - access: updated based on the new guest PTE - * - quadrant: handled by get_written_sptes() - * - invalid: always false (loop only walks valid shadow pages) - */ -static const union kvm_mmu_page_role role_ign = { - .level = 0xf, - .access = 0x7, - .quadrant = 0x3, - .invalid = 0x1, -}; - static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, struct kvm_page_track_notifier_node *node) @@ -4999,7 +4968,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ mmu_topup_memory_caches(vcpu, true); - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5020,14 +4989,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, local_flush = true; while (npte--) { - u32 base_role = vcpu->arch.mmu->mmu_role.base.word; - entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); - if (gentry && - !((sp->role.word ^ base_role) & ~role_ign.word) && - rmap_can_add(vcpu)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); + if (gentry && sp->role.level != PG_LEVEL_4K) + ++vcpu->kvm->stat.mmu_pde_zapped; if (need_remote_flush(entry, *spte)) remote_flush = true; ++spte; @@ -5035,7 +5000,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) @@ -5125,7 +5090,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(gva, vcpu)) return; - kvm_x86_ops.tlb_flush_gva(vcpu, gva); + static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); } if (!mmu->invlpg) @@ -5182,7 +5147,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - kvm_x86_ops.tlb_flush_gva(vcpu, gva); + static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); ++vcpu->stat.invlpg; @@ -5233,14 +5198,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, if (iterator.rmap) flush |= fn(kvm, iterator.rmap); - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && lock_flush_tlb) { kvm_flush_remote_tlbs_with_address(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); } } @@ -5390,7 +5355,7 @@ restart: * be in active use by the guest. */ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } @@ -5423,7 +5388,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* @@ -5447,10 +5412,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) @@ -5492,7 +5457,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5510,13 +5475,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } - if (kvm->arch.tdp_mmu_enabled) { + if (is_tdp_mmu_enabled(kvm)) { flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); if (flush) kvm_flush_remote_tlbs(kvm); } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, @@ -5531,12 +5496,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); /* * We can flush all the TLBs out of the mmu lock without TLB @@ -5596,13 +5561,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot) { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, @@ -5625,11 +5590,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); /* * It's also safe to flush TLBs out of mmu lock here as currently this @@ -5647,12 +5612,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5664,11 +5629,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, { bool flush; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5681,23 +5646,23 @@ void kvm_mmu_zap_all(struct kvm *kvm) LIST_HEAD(invalid_list); int ign; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - if (cond_resched_lock(&kvm->mmu_lock)) + if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (kvm->arch.tdp_mmu_enabled) + if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_zap_all(kvm); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) @@ -5757,7 +5722,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) continue; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvm_has_zapped_obsolete_pages(kvm)) { kvm_mmu_commit_zap_page(kvm, @@ -5768,7 +5733,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); /* @@ -5988,7 +5953,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; @@ -6005,22 +5970,22 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - if (sp->tdp_mmu_page) + if (is_tdp_mmu_page(sp)) { kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); - else { + } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); + cond_resched_rwlock_write(&kvm->mmu_lock); } } kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index c8d51a37e2ce..ced15fd58fde 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct static_key mmu_audit_key; +static DEFINE_STATIC_KEY_FALSE(mmu_audit_key); static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_key_false((&mmu_audit_key))) + if (static_branch_unlikely((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - static_key_slow_inc(&mmu_audit_key); + static_branch_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - static_key_slow_dec(&mmu_audit_key); + static_branch_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bfc6389edc28..9e38d3c5daad 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -12,7 +12,7 @@ extern bool dbg; #define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0) #define MMU_WARN_ON(x) WARN_ON(x) #else #define pgprintk(x...) do { } while (0) @@ -56,7 +56,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; +#ifdef CONFIG_X86_64 bool tdp_mmu_page; + + /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + struct rcu_head rcu_head; +#endif }; extern struct kmem_cache *mmu_page_header_cache; diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 8443a675715b..34bb0ec69bd8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, head = &kvm->arch.track_notifier_head; - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); hlist_del_rcu(&n->node); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 50e268eb8e1a..d9f66cc459e8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, } r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } @@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) return; } - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c51ad544f25b..ef55f0bc4ccf 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -120,7 +120,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 2b3a30bd38b0..6de3950fd704 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -131,6 +131,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT /* + * If a thread running without exclusive control of the MMU lock must perform a + * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * non-present intermediate value. Other threads which encounter this value + * should not modify the SPTE. + * + * This constant works because it is considered non-present on both AMD and + * Intel CPUs and does not create a L1TF vulnerability because the pfn section + * is zeroed out. + * + * Only used by the TDP MMU. + */ +#define REMOVED_SPTE (1ull << 59) + +static inline bool is_removed_spte(u64 spte) +{ + return spte == REMOVED_SPTE; +} + +/* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll @@ -185,23 +204,19 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline int is_shadow_present_pte(u64 pte) +static inline bool is_shadow_present_pte(u64 pte) { - return (pte != 0) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte); } -static inline int is_large_pte(u64 pte) +static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; } -static inline int is_last_spte(u64 pte, int level) +static inline bool is_last_spte(u64 pte, int level) { - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; + return (level == PG_LEVEL_4K) || is_large_pte(pte); } static inline bool is_executable_pte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 87b7e16911db..e5f148106e20 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); } static gfn_t round_gfn_for_level(gfn_t gfn, int level) @@ -22,21 +22,22 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) /* * Sets a TDP iterator to walk a pre-order traversal of the paging structure - * rooted at root_pt, starting with the walk to translate goal_gfn. + * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, - int min_level, gfn_t goal_gfn) + int min_level, gfn_t next_last_level_gfn) { WARN_ON(root_level < 1); WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); - iter->goal_gfn = goal_gfn; + iter->next_last_level_gfn = next_last_level_gfn; + iter->yielded_gfn = iter->next_last_level_gfn; iter->root_level = root_level; iter->min_level = min_level; iter->level = root_level; - iter->pt_path[iter->level - 1] = root_pt; + iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt; - iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); iter->valid = true; @@ -47,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, * address of the child page table referenced by the SPTE. Returns null if * there is no such entry. */ -u64 *spte_to_child_pt(u64 spte, int level) +tdp_ptep_t spte_to_child_pt(u64 spte, int level) { /* * There's no child entry if this entry isn't present or is a @@ -56,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level) if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) return NULL; - return __va(spte_to_pfn(spte) << PAGE_SHIFT); + return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT); } /* @@ -65,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level) */ static bool try_step_down(struct tdp_iter *iter) { - u64 *child_pt; + tdp_ptep_t child_pt; if (iter->level == iter->min_level) return false; @@ -74,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter) * Reread the SPTE before stepping down to avoid traversing into page * tables that are no longer linked from this entry. */ - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); child_pt = spte_to_child_pt(iter->old_spte, iter->level); if (!child_pt) @@ -82,7 +83,7 @@ static bool try_step_down(struct tdp_iter *iter) iter->level--; iter->pt_path[iter->level - 1] = child_pt; - iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; @@ -106,9 +107,9 @@ static bool try_step_side(struct tdp_iter *iter) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); - iter->goal_gfn = iter->gfn; + iter->next_last_level_gfn = iter->gfn; iter->sptep++; - iter->old_spte = READ_ONCE(*iter->sptep); + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep)); return true; } @@ -158,24 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter) iter->valid = false; } -/* - * Restart the walk over the paging structure from the root, starting from the - * highest gfn the iterator had previously reached. Assumes that the entire - * paging structure, except the root page, may have been completely torn down - * and rebuilt. - */ -void tdp_iter_refresh_walk(struct tdp_iter *iter) -{ - gfn_t goal_gfn = iter->goal_gfn; - - if (iter->gfn > goal_gfn) - goal_gfn = iter->gfn; - - tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], - iter->root_level, iter->min_level, goal_gfn); -} - -u64 *tdp_iter_root_pt(struct tdp_iter *iter) +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter) { return iter->pt_path[iter->root_level - 1]; } diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 47170d0dc98e..4cc177d75c4a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -7,6 +7,8 @@ #include "mmu.h" +typedef u64 __rcu *tdp_ptep_t; + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ @@ -15,11 +17,17 @@ struct tdp_iter { * The iterator will traverse the paging structure towards the mapping * for this GFN. */ - gfn_t goal_gfn; + gfn_t next_last_level_gfn; + /* + * The next_last_level_gfn at the time when the thread last + * yielded. Only yielding when the next_last_level_gfn != + * yielded_gfn helps ensure forward progress. + */ + gfn_t yielded_gfn; /* Pointers to the page tables traversed to reach the current SPTE */ - u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ - u64 *sptep; + tdp_ptep_t sptep; /* The lowest GFN mapped by the current SPTE */ gfn_t gfn; /* The level of the root page given to the iterator */ @@ -49,12 +57,11 @@ struct tdp_iter { #define for_each_tdp_pte(iter, root, root_level, start, end) \ for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) -u64 *spte_to_child_pt(u64 pte, int level); +tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, - int min_level, gfn_t goal_gfn); + int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); -void tdp_iter_refresh_walk(struct tdp_iter *iter); -u64 *tdp_iter_root_pt(struct tdp_iter *iter); +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index b56d604809b8..71e100a5670f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,32 +7,23 @@ #include "tdp_mmu.h" #include "spte.h" +#include <asm/cmpxchg.h> #include <trace/events/kvm.h> -#ifdef CONFIG_X86_64 static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); -#endif - -static bool is_tdp_mmu_enabled(void) -{ -#ifdef CONFIG_X86_64 - return tdp_enabled && READ_ONCE(tdp_mmu_enabled); -#else - return false; -#endif /* CONFIG_X86_64 */ -} /* Initializes the TDP MMU for the VM, if enabled. */ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - if (!is_tdp_mmu_enabled()) + if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) return; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); } @@ -42,6 +33,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) return; WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); + + /* + * Ensure that all the outstanding RCU callbacks to free shadow pages + * can run before the VM is torn down. + */ + rcu_barrier(); } static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) @@ -53,7 +50,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, struct kvm_mmu_page *root) { - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) return false; @@ -88,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, #define for_each_tdp_mmu_root(_kvm, _root) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) -{ - struct kvm_mmu_page *sp; - - if (!kvm->arch.tdp_mmu_enabled) - return false; - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; - - return sp->tdp_mmu_page && sp->root_count; -} - static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield); @@ -111,7 +92,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(root->root_count); WARN_ON(!root->tdp_mmu_page); @@ -164,13 +145,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); /* Check for an existing root before allocating a new one. */ for_each_tdp_mmu_root(kvm, root) { if (root->role.word == role.word) { kvm_mmu_get_root(kvm, root); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return root; } } @@ -180,7 +161,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) list_add(&root->link, &kvm->arch.tdp_mmu_roots); - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); return root; } @@ -196,8 +177,31 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) return __pa(root->spt); } +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) +{ + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); +} + +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. + */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) +{ + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); + + tdp_mmu_free_sp(sp); +} + static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level); + u64 old_spte, u64 new_spte, int level, + bool shared); static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) { @@ -235,6 +239,128 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, } /** + * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the new page + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. + * @account_nx: This page replaces a NX large page and should be marked for + * eventual reclaim. + */ +static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared, bool account_nx) +{ + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); + + list_add(&sp->link, &kvm->arch.tdp_mmu_pages); + if (account_nx) + account_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +/** + * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU + * + * @kvm: kvm instance + * @sp: the page to be removed + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be adding or removing pages. + */ +static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool shared) +{ + if (shared) + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); + + list_del(&sp->link); + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + if (shared) + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +/** + * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure + * + * @kvm: kvm instance + * @pt: the page removed from the paging structure + * @shared: This operation may not be running under the exclusive use + * of the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. + * + * Given a page table that has been removed from the TDP paging structure, + * iterates through the page table to clear SPTEs and free child page tables. + */ +static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, + bool shared) +{ + struct kvm_mmu_page *sp = sptep_to_sp(pt); + int level = sp->role.level; + gfn_t base_gfn = sp->gfn; + u64 old_child_spte; + u64 *sptep; + gfn_t gfn; + int i; + + trace_kvm_mmu_prepare_zap_page(sp); + + tdp_mmu_unlink_page(kvm, sp, shared); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + sptep = pt + i; + gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); + + if (shared) { + /* + * Set the SPTE to a nonpresent value that other + * threads will not overwrite. If the SPTE was + * already marked as removed then another thread + * handling a page fault could overwrite it, so + * set the SPTE until it is set from some other + * value to the removed SPTE value. + */ + for (;;) { + old_child_spte = xchg(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_child_spte)) + break; + cpu_relax(); + } + } else { + old_child_spte = READ_ONCE(*sptep); + + /* + * Marking the SPTE as a removed SPTE is not + * strictly necessary here as the MMU lock will + * stop other threads from concurrently modifying + * this SPTE. Using the removed SPTE value keeps + * the two branches consistent and simplifies + * the function. + */ + WRITE_ONCE(*sptep, REMOVED_SPTE); + } + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, + old_child_spte, REMOVED_SPTE, level - 1, + shared); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); +} + +/** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of @@ -242,22 +368,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * @old_spte: The value of the SPTE before the change * @new_spte: The value of the SPTE after the change * @level: the level of the PT the SPTE is part of in the paging structure + * @shared: This operation may not be running under the exclusive use of + * the MMU lock and the operation must synchronize with other + * threads that might be modifying SPTEs. * * Handle bookkeeping that might result from the modification of a SPTE. * This function must be called for all TDP SPTE modifications. */ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - u64 *pt; - struct kvm_mmu_page *sp; - u64 old_child_spte; - int i; WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -298,15 +424,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE, it is - * unexpected. Log the change, though it should not impact the - * guest since both the former and current SPTEs are nonpresent. + * If this change does not involve a MMIO SPTE or removed SPTE, + * it is unexpected. Log the change, though it should not + * impact the guest since both the former and current SPTEs + * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + if (WARN_ON(!is_mmio_spte(old_spte) && + !is_mmio_spte(new_spte) && + !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" - "are MMIO SPTEs.\n" + "are MMIO SPTEs, or the new SPTE is\n" + "a temporary removed SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -321,54 +451,127 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Recursively handle child PTs if the change removed a subtree from * the paging structure. */ - if (was_present && !was_leaf && (pfn_changed || !is_present)) { - pt = spte_to_child_pt(old_spte, level); - sp = sptep_to_sp(pt); + if (was_present && !was_leaf && (pfn_changed || !is_present)) + handle_removed_tdp_mmu_page(kvm, + spte_to_child_pt(old_spte, level), shared); +} - trace_kvm_mmu_prepare_zap_page(sp); +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, + shared); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} - list_del(&sp->link); +/* + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the + * associated bookkeeping + * + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + lockdep_assert_held_read(&kvm->mmu_lock); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - old_child_spte = READ_ONCE(*(pt + i)); - WRITE_ONCE(*(pt + i), 0); - handle_changed_spte(kvm, as_id, - gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), - old_child_spte, 0, level - 1); - } + /* + * Do not change removed SPTEs. Only the thread that froze the SPTE + * may modify it. + */ + if (iter->old_spte == REMOVED_SPTE) + return false; - kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, + new_spte) != iter->old_spte) + return false; - free_page((unsigned long)pt); - kmem_cache_free(mmu_page_header_cache, sp); - } + handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level, true); + + return true; } -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) +static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + /* + * Freeze the SPTE by setting it to a special, + * non-present value. This will stop other threads from + * immediately installing a present entry in its place + * before the TLBs are flushed. + */ + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + return false; + + kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, + KVM_PAGES_PER_HPAGE(iter->level)); + + /* + * No other thread can overwrite the removed SPTE as they + * must either wait on the MMU lock or use + * tdp_mmu_set_spte_atomic which will not overrite the + * special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present + * to non-present. + */ + WRITE_ONCE(*iter->sptep, 0); + + return true; } + +/* + * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * @kvm: kvm instance + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * @record_acc_track: Notify the MM subsystem of changes to the accessed state + * of the page. Should be set unless handling an MMU + * notifier for access tracking. Leaving record_acc_track + * unset in that case prevents page accesses from being + * double counted. + * @record_dirty_log: Record the page as dirty in the dirty bitmap if + * appropriate for the change being made. Should be set + * unless performing certain dirty logging operations. + * Leaving record_dirty_log unset in that case prevents page + * writes from being double counted. + */ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { - u64 *root_pt = tdp_iter_root_pt(iter); + tdp_ptep_t root_pt = tdp_iter_root_pt(iter); struct kvm_mmu_page *root = sptep_to_sp(root_pt); int as_id = kvm_mmu_page_as_id(root); - WRITE_ONCE(*iter->sptep, new_spte); + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * No thread should be using this function to set SPTEs to the + * temporary removed SPTE value. + * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic + * should be used. If operating under the MMU lock in write mode, the + * use of the removed SPTE should not be necessary. + */ + WARN_ON(iter->old_spte == REMOVED_SPTE); + + WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level); + iter->level, false); if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); @@ -413,27 +616,46 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, _mmu->shadow_root_level, _start, _end) /* - * Flush the TLB if the process should drop kvm->mmu_lock. - * Return whether the caller still needs to flush the tlb. + * Yield if the MMU lock is contended or this thread needs to return control + * to the scheduler. + * + * If this function should yield and flush is set, it will perform a remote + * TLB flush before yielding. + * + * If this function yields, it will also reset the tdp_iter's walk over the + * paging structure and the calling function should skip to the next + * iteration to allow the iterator to continue its traversal from the + * paging structure root. + * + * Return true if this function yielded and the iterator's traversal was reset. + * Return false if a yield was not needed. */ -static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, bool flush) { - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_flush_remote_tlbs(kvm); - cond_resched_lock(&kvm->mmu_lock); - tdp_iter_refresh_walk(iter); + /* Ensure forward progress has been made before yielding. */ + if (iter->next_last_level_gfn == iter->yielded_gfn) return false; - } else { + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + rcu_read_unlock(); + + if (flush) + kvm_flush_remote_tlbs(kvm); + + cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_lock(); + + WARN_ON(iter->gfn > iter->next_last_level_gfn); + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], + iter->root_level, iter->min_level, + iter->next_last_level_gfn); + return true; } -} -static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) -{ - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - cond_resched_lock(&kvm->mmu_lock); - tdp_iter_refresh_walk(iter); - } + return false; } /* @@ -453,7 +675,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; bool flush_needed = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { + if (can_yield && + tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { + flush_needed = false; + continue; + } + if (!is_shadow_present_pte(iter.old_spte)) continue; @@ -468,12 +698,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - - if (can_yield) - flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); - else - flush_needed = true; + flush_needed = true; } + + rcu_read_unlock(); return flush_needed; } @@ -517,21 +745,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, int ret = 0; int make_spte_ret = 0; - if (unlikely(is_noslot_pfn(pfn))) { + if (unlikely(is_noslot_pfn(pfn))) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); - trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); - } else { + else make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, pfn, iter->old_spte, prefault, true, map_writable, !shadow_accessed_mask, &new_spte); - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); - } if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else - tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + return RET_PF_RETRY; /* * If the page fault was caused by a write but the page is write @@ -545,10 +770,16 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ - if (unlikely(is_mmio_spte(new_spte))) + if (unlikely(is_mmio_spte(new_spte))) { + trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, + new_spte); ret = RET_PF_EMULATE; + } else + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); - trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + trace_kvm_mmu_set_spte(iter->level, iter->gfn, + rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; @@ -586,6 +817,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); + + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { if (nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(iter.old_spte, gfn, @@ -601,49 +835,61 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, */ if (is_shadow_present_pte(iter.old_spte) && is_large_pte(iter.old_spte)) { - tdp_mmu_set_spte(vcpu->kvm, &iter, 0); - - kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, - KVM_PAGES_PER_HPAGE(iter.level)); + if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) + break; /* * The iter must explicitly re-read the spte here * because the new value informs the !present * path below. */ - iter.old_spte = READ_ONCE(*iter.sptep); + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); } if (!is_shadow_present_pte(iter.old_spte)) { sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); - list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); child_pt = sp->spt; - clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - trace_kvm_mmu_get_page(sp, true); - if (huge_page_disallowed && req_level >= iter.level) - account_huge_nx_page(vcpu->kvm, sp); - - tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, + new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, true, + huge_page_disallowed && + req_level >= iter.level); + + trace_kvm_mmu_get_page(sp, true); + } else { + tdp_mmu_free_sp(sp); + break; + } } } - if (WARN_ON(iter.level != level)) + if (iter.level != level) { + rcu_read_unlock(); return RET_PF_RETRY; + } ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, pfn, prefault); + rcu_read_unlock(); return ret; } -static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end, unsigned long data, - int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long data)) +static __always_inline int +kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, + gfn_t start, + gfn_t end, + unsigned long data)) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -705,6 +951,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, int young = 0; u64 new_spte = 0; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { /* * If we have a non-accessed entry we don't need to change the @@ -736,6 +984,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, trace_kvm_age_page(iter.gfn, iter.level, slot, young); } + rcu_read_unlock(); + return young; } @@ -781,6 +1031,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, u64 new_spte; int need_flush = 0; + rcu_read_lock(); + WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); @@ -809,6 +1061,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, if (need_flush) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + rcu_read_unlock(); + return 0; } @@ -832,21 +1086,27 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); for_each_tdp_pte_min_level(iter, root->spt, root->role.level, min_level, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; + if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + !is_last_spte(iter.old_spte, iter.level) || + !(iter.old_spte & PT_WRITABLE_MASK)) continue; new_spte = iter.old_spte & ~PT_WRITABLE_MASK; tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; - - tdp_mmu_iter_cond_resched(kvm, &iter); } + + rcu_read_unlock(); return spte_set; } @@ -888,7 +1148,12 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; + if (spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -903,9 +1168,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); spte_set = true; - - tdp_mmu_iter_cond_resched(kvm, &iter); } + + rcu_read_unlock(); return spte_set; } @@ -947,6 +1212,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, struct tdp_iter iter; u64 new_spte; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) @@ -956,6 +1223,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, !(mask & (1UL << (iter.gfn - gfn)))) continue; + mask &= ~(1UL << (iter.gfn - gfn)); + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { if (is_writable_pte(iter.old_spte)) new_spte = iter.old_spte & ~PT_WRITABLE_MASK; @@ -969,9 +1238,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); - - mask &= ~(1UL << (iter.gfn - gfn)); } + + rcu_read_unlock(); } /* @@ -989,7 +1258,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root; int root_as_id; - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) @@ -1011,18 +1280,23 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { - if (!is_shadow_present_pte(iter.old_spte)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) + continue; + + if (!is_shadow_present_pte(iter.old_spte) || + iter.old_spte & shadow_dirty_mask) continue; new_spte = iter.old_spte | shadow_dirty_mask; tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; - - tdp_mmu_iter_cond_resched(kvm, &iter); } + rcu_read_unlock(); return spte_set; } @@ -1060,7 +1334,14 @@ static void zap_collapsible_spte_range(struct kvm *kvm, kvm_pfn_t pfn; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_pte(iter, root, start, end) { + if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { + spte_set = false; + continue; + } + if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; @@ -1072,9 +1353,10 @@ static void zap_collapsible_spte_range(struct kvm *kvm, tdp_mmu_set_spte(kvm, &iter, 0); - spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + spte_set = true; } + rcu_read_unlock(); if (spte_set) kvm_flush_remote_tlbs(kvm); } @@ -1111,6 +1393,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; + rcu_read_lock(); + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { if (!is_writable_pte(iter.old_spte)) break; @@ -1122,6 +1406,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, spte_set = true; } + rcu_read_unlock(); + return spte_set; } @@ -1137,7 +1423,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, int root_as_id; bool spte_set = false; - lockdep_assert_held(&kvm->mmu_lock); + lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) @@ -1162,10 +1448,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; + rcu_read_lock(); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } + rcu_read_unlock(); + return leaf; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index cbbdbadd1526..b4b65e3699b3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,10 +5,6 @@ #include <linux/kvm_host.h> -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); - -bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); @@ -47,4 +43,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +#ifdef CONFIG_X86_64 +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } +#else +static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +#endif + +static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + if (!is_tdp_mmu_enabled(kvm)) + return false; + if (WARN_ON(!VALID_PAGE(hpa))) + return false; + + sp = to_shadow_page(hpa); + if (WARN_ON(!sp)) + return false; + + return is_tdp_mmu_page(sp) && sp->root_count; +} + #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index f472fdb6ae7e..a8502e02f479 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* variable MTRRs */ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { /* MTRR base */ if (!valid_mtrr_type(data & 0xff)) @@ -351,14 +351,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (var_mtrr_range_is_valid(cur)) list_del(&mtrr_state->var_ranges[index].node); - /* Extend the mask with all 1 bits to the left, since those - * bits must implicitly be 0. The bits are then cleared - * when reading them. + /* + * Set all illegal GPA bits in the mask, since those bits must + * implicitly be 0. The bits are then cleared when reading them. */ if (!is_mtrr_mask) cur->base = data; else - cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); + cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); /* add it to the list if it's enabled. */ if (var_mtrr_range_is_valid(cur)) { @@ -426,7 +426,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; - *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; + *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } return 0; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 67741d2a0308..827886c12c16 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -373,7 +373,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 1; if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && - (kvm_x86_ops.get_cpl(vcpu) != 0) && + (static_call(kvm_x86_get_cpl)(vcpu) != 0) && (kvm_read_cr0(vcpu) & X86_CR0_PE)) return 1; @@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - if (lapic_in_kernel(vcpu)) + if (lapic_in_kernel(vcpu)) { + if (kvm_x86_ops.pmu_ops->deliver_pmi) + kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); + } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) @@ -473,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); } + if (kvm_x86_ops.pmu_ops->cleanup) + kvm_x86_ops.pmu_ops->cleanup(vcpu); + bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 067fef51760c..7b30bc967af3 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -39,6 +39,8 @@ struct kvm_pmu_ops { void (*refresh)(struct kvm_vcpu *vcpu); void (*init)(struct kvm_vcpu *vcpu); void (*reset)(struct kvm_vcpu *vcpu); + void (*deliver_pmi)(struct kvm_vcpu *vcpu); + void (*cleanup)(struct kvm_vcpu *vcpu); }; static inline u64 pmc_bitmask(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0ef84d57b72e..78bdcfac4e40 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -298,6 +298,23 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, + u32 icrl, u32 icrh) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, source, + icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } +} + int avic_incomplete_ipi_interception(struct vcpu_svm *svm) { u32 icrh = svm->vmcb->control.exit_info_1 >> 32; @@ -324,28 +341,14 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR2, icrh); kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; - case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { - int i; - struct kvm_vcpu *vcpu; - struct kvm *kvm = svm->vcpu.kvm; - struct kvm_lapic *apic = svm->vcpu.arch.apic; - + case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: /* * At this point, we expect that the AVIC HW has already * set the appropriate IRR bits on the valid target * vcpus. So, we just need to kick the appropriate vcpu. */ - kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, apic, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) - kvm_vcpu_wake_up(vcpu); - } + avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh); break; - } case AVIC_IPI_FAILURE_INVALID_TARGET: WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", index, svm->vcpu.vcpu_id, icrh, icrl); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index db30670dd8c4..cc91738ab445 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -58,7 +58,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, offset_in_page(cr3) + index * 8, 8); if (ret) return 0; @@ -248,7 +248,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) if (vmcb12_lma) { if (!(vmcb12->save.cr4 & X86_CR4_PAE) || !(vmcb12->save.cr0 & X86_CR0_PE) || - (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) return false; } if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) @@ -345,7 +345,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && @@ -392,7 +392,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.rsp = vmcb12->save.rsp; svm->vmcb->save.rip = vmcb12->save.rip; svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM; + svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; svm->vmcb->save.cpl = vmcb12->save.cpl; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 48017fef1cd9..874ea309279f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -22,6 +22,7 @@ #include "x86.h" #include "svm.h" +#include "svm_ops.h" #include "cpuid.h" #include "trace.h" @@ -1041,6 +1042,74 @@ e_unpin_memory: return ret; } +static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + void __user *report = (void __user *)(uintptr_t)argp->data; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_attestation_report *data; + struct kvm_sev_attestation_report params; + void __user *p; + void *blob = NULL; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + /* User wants to query the blob length */ + if (!params.len) + goto cmd; + + p = (void __user *)(uintptr_t)params.uaddr; + if (p) { + if (params.len > SEV_FW_BLOB_MAX_SIZE) { + ret = -EINVAL; + goto e_free; + } + + ret = -ENOMEM; + blob = kmalloc(params.len, GFP_KERNEL); + if (!blob) + goto e_free; + + data->address = __psp_pa(blob); + data->len = params.len; + memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce)); + } +cmd: + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error); + /* + * If we query the session length, FW responded with expected data. + */ + if (!params.len) + goto done; + + if (ret) + goto e_free_blob; + + if (blob) { + if (copy_to_user(p, blob, params.len)) + ret = -EFAULT; + } + +done: + params.len = data->len; + if (copy_to_user(report, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free_blob: + kfree(blob); +e_free: + kfree(data); + return ret; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1091,6 +1160,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_SECRET: r = sev_launch_secret(kvm, &sev_cmd); break; + case KVM_SEV_GET_ATTESTATION_REPORT: + r = sev_get_attestation_report(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -1994,29 +2066,17 @@ void sev_es_create_vcpu(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu) +void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) { struct svm_cpu_data *sd = per_cpu(svm_data, cpu); struct vmcb_save_area *hostsa; - unsigned int i; /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, * of which one step is to perform a VMLOAD. Since hardware does not * perform a VMSAVE on VMRUN, the host savearea must be updated. */ - asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); - - /* - * Certain MSRs are restored on VMEXIT, only save ones that aren't - * restored. - */ - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) { - if (host_save_user_msrs[i].sev_es_restored) - continue; - - rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]); - } + vmsave(__sme_page_pa(sd->save_area)); /* XCR0 is restored on VMEXIT, save the current host value */ hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); @@ -2029,22 +2089,6 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu) hostsa->xss = host_xss; } -void sev_es_vcpu_put(struct vcpu_svm *svm) -{ - unsigned int i; - - /* - * Certain MSRs are restored on VMEXIT and were saved with vmsave in - * sev_es_vcpu_load() above. Only restore ones that weren't. - */ - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) { - if (host_save_user_msrs[i].sev_es_restored) - continue; - - wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]); - } -} - void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { struct vcpu_svm *svm = to_svm(vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3442d44ca53b..adb3619a3c16 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -41,6 +41,7 @@ #include "trace.h" #include "svm.h" +#include "svm_ops.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -200,9 +201,9 @@ module_param(sev_es, int, 0444); bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); -static u8 rsm_ins_bytes[] = "\x0f\xaa"; +static bool svm_gp_erratum_intercept = true; -static void svm_complete_interrupts(struct vcpu_svm *svm); +static u8 rsm_ins_bytes[] = "\x0f\xaa"; static unsigned long iopm_base; @@ -246,21 +247,6 @@ u32 svm_msrpm_offset(u32 msr) #define MAX_INST_SIZE 15 -static inline void clgi(void) -{ - asm volatile (__ex("clgi")); -} - -static inline void stgi(void) -{ - asm volatile (__ex("stgi")); -} - -static inline void invlpga(unsigned long addr, u32 asid) -{ - asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); -} - static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 @@ -288,6 +274,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!(efer & EFER_SVME)) { svm_leave_nested(svm); svm_set_gif(svm, true); + /* #GP intercept is still needed for vmware backdoor */ + if (!enable_vmware_backdoor) + clr_exception_intercept(svm, GP_VECTOR); /* * Free the nested guest state, unless we are in SMM. @@ -304,6 +293,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.efer = old_efer; return ret; } + + if (svm_gp_erratum_intercept) + set_exception_intercept(svm, GP_VECTOR); } } @@ -925,6 +917,9 @@ static __init void svm_set_cpu_caps(void) if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } /* CPUID 0x80000008 */ @@ -1032,6 +1027,9 @@ static __init int svm_hardware_setup(void) } } + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + if (vgif) { if (!boot_cpu_has(X86_FEATURE_VGIF)) vgif = false; @@ -1207,7 +1205,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; - kvm_set_rflags(&svm->vcpu, 2); + kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1366,6 +1364,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->vmsa = page_address(vmsa_page); svm->asid_generation = 0; + svm->guest_state_loaded = false; init_vmcb(svm); svm_init_osvw(vcpu); @@ -1413,30 +1412,31 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); } -static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - int i; + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + unsigned int i; - if (unlikely(cpu != vcpu->cpu)) { - svm->asid_generation = 0; - vmcb_mark_all_dirty(svm->vmcb); - } + if (svm->guest_state_loaded) + return; + /* + * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save + * area (non-sev-es). Save ones that aren't so we can restore them + * individually later. + */ + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + /* + * Save additional host state that will be restored on VMEXIT (sev-es) + * or subsequent vmload of host save area. + */ if (sev_es_guest(svm->vcpu.kvm)) { - sev_es_vcpu_load(svm, cpu); + sev_es_prepare_guest_switch(svm, vcpu->cpu); } else { -#ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); -#endif - savesegment(fs, svm->host.fs); - savesegment(gs, svm->host.gs); - svm->host.ldt = kvm_read_ldt(); - - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i].index, - svm->host_user_msrs[i]); + vmsave(__sme_page_pa(sd->save_area)); } if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { @@ -1446,10 +1446,42 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); } } + /* This assumes that the kernel never uses MSR_TSC_AUX */ if (static_cpu_has(X86_FEATURE_RDTSCP)) wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + svm->guest_state_loaded = true; +} + +static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + unsigned int i; + + if (!svm->guest_state_loaded) + return; + + /* + * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save + * area (non-sev-es). Restore the ones that weren't. + */ + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + svm->guest_state_loaded = false; +} + +static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + + if (unlikely(cpu != vcpu->cpu)) { + svm->asid_generation = 0; + vmcb_mark_all_dirty(svm->vmcb); + } + if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); @@ -1459,30 +1491,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - int i; - avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; - if (sev_es_guest(svm->vcpu.kvm)) { - sev_es_vcpu_put(svm); - } else { - kvm_load_ldt(svm->host.ldt); -#ifdef CONFIG_X86_64 - loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); - load_gs_index(svm->host.gs); -#else -#ifdef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif - - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i].index, - svm->host_user_msrs[i]); - } } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -1815,7 +1827,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_exception_bitmap(struct kvm_vcpu *vcpu) +static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1865,7 +1877,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); /* - * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here, + * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, * because db_interception might need it. We can do it before vmentry. */ vcpu->arch.dr6 = svm->vmcb->save.dr6; @@ -1916,7 +1928,7 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { - u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1; + u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); return 1; } @@ -1962,24 +1974,6 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } -static int gp_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - u32 error_code = svm->vmcb->control.exit_info_1; - - WARN_ON_ONCE(!enable_vmware_backdoor); - - /* - * VMware backdoor emulation on #GP interception only handles IN{S}, - * OUT{S}, and RDPMC, none of which generate a non-zero error code. - */ - if (error_code) { - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; - } - return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); -} - static bool is_erratum_383(void) { int err, i; @@ -2178,6 +2172,102 @@ static int vmrun_interception(struct vcpu_svm *svm) return nested_svm_vmrun(svm); } +enum { + NONE_SVM_INSTR, + SVM_INSTR_VMRUN, + SVM_INSTR_VMLOAD, + SVM_INSTR_VMSAVE, +}; + +/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ +static int svm_instr_opcode(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + if (ctxt->b != 0x1 || ctxt->opcode_len != 2) + return NONE_SVM_INSTR; + + switch (ctxt->modrm) { + case 0xd8: /* VMRUN */ + return SVM_INSTR_VMRUN; + case 0xda: /* VMLOAD */ + return SVM_INSTR_VMLOAD; + case 0xdb: /* VMSAVE */ + return SVM_INSTR_VMSAVE; + default: + break; + } + + return NONE_SVM_INSTR; +} + +static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) +{ + const int guest_mode_exit_codes[] = { + [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, + [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, + [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, + }; + int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = { + [SVM_INSTR_VMRUN] = vmrun_interception, + [SVM_INSTR_VMLOAD] = vmload_interception, + [SVM_INSTR_VMSAVE] = vmsave_interception, + }; + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu)) { + svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode]; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + return nested_svm_vmexit(svm); + } else + return svm_instr_handlers[opcode](svm); +} + +/* + * #GP handling code. Note that #GP can be triggered under the following two + * cases: + * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on + * some AMD CPUs when EAX of these instructions are in the reserved memory + * regions (e.g. SMM memory on host). + * 2) VMware backdoor + */ +static int gp_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 error_code = svm->vmcb->control.exit_info_1; + int opcode; + + /* Both #GP cases have zero error_code */ + if (error_code) + goto reinject; + + /* Decode the instruction for usage later */ + if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) + goto reinject; + + opcode = svm_instr_opcode(vcpu); + + if (opcode == NONE_SVM_INSTR) { + if (!enable_vmware_backdoor) + goto reinject; + + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC. + */ + if (!is_guest_mode(vcpu)) + return kvm_emulate_instruction(vcpu, + EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); + } else + return emulate_svm_instr(vcpu, opcode); + +reinject: + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; +} + void svm_set_gif(struct vcpu_svm *svm, bool value) { if (value) { @@ -2265,11 +2355,8 @@ static int xsetbv_interception(struct vcpu_svm *svm) u64 new_bv = kvm_read_edx_eax(&svm->vcpu); u32 index = kvm_rcx_read(&svm->vcpu); - if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - return kvm_skip_emulated_instruction(&svm->vcpu); - } - - return 1; + int err = kvm_set_xcr(&svm->vcpu, index, new_bv); + return kvm_complete_insn_gp(&svm->vcpu, err); } static int rdpru_interception(struct vcpu_svm *svm) @@ -2530,6 +2617,7 @@ static int dr_interception(struct vcpu_svm *svm) { int reg, dr; unsigned long val; + int err = 0; if (svm->vcpu.guest_debug == 0) { /* @@ -2547,20 +2635,16 @@ static int dr_interception(struct vcpu_svm *svm) reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; - - if (dr >= 16) { /* mov to DRn */ - if (!kvm_require_dr(&svm->vcpu, dr - 16)) - return 1; + if (dr >= 16) { /* mov to DRn */ + dr -= 16; val = kvm_register_read(&svm->vcpu, reg); - kvm_set_dr(&svm->vcpu, dr - 16, val); + err = kvm_set_dr(&svm->vcpu, dr, val); } else { - if (!kvm_require_dr(&svm->vcpu, dr)) - return 1; kvm_get_dr(&svm->vcpu, dr, &val); kvm_register_write(&svm->vcpu, reg, val); } - return kvm_skip_emulated_instruction(&svm->vcpu); + return kvm_complete_insn_gp(&svm->vcpu, err); } static int cr8_write_interception(struct vcpu_svm *svm) @@ -3354,7 +3438,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3479,7 +3563,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !svm_interrupt_blocked(vcpu); } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static void svm_enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3503,7 +3587,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) } } -static void enable_nmi_window(struct kvm_vcpu *vcpu) +static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3560,10 +3644,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) invlpga(gva, svm->vmcb->control.asid); } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) -{ -} - static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3708,16 +3788,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, if (sev_es_guest(svm->vcpu.kvm)) { __svm_sev_es_vcpu_run(svm->vmcb_pa); } else { + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); -#ifdef CONFIG_X86_64 - native_wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + vmload(__sme_page_pa(sd->save_area)); } /* @@ -3783,7 +3858,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) svm_set_dr6(svm, vcpu->arch.dr6); else - svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM); + svm_set_dr6(svm, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -3978,7 +4053,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) { best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); if (best) - vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f)); + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } if (!kvm_vcpu_apicv_active(vcpu)) @@ -4285,7 +4360,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return ret; } -static void enable_smi_window(struct kvm_vcpu *vcpu) +static void svm_enable_smi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4439,7 +4514,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_blocking = svm_vcpu_blocking, .vcpu_unblocking = svm_vcpu_unblocking, - .update_exception_bitmap = update_exception_bitmap, + .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, .get_msr = svm_get_msr, .set_msr = svm_set_msr, @@ -4482,9 +4557,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, .set_nmi_mask = svm_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, + .enable_nmi_window = svm_enable_nmi_window, + .enable_irq_window = svm_enable_irq_window, + .update_cr8_intercept = svm_update_cr8_intercept, .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, @@ -4527,7 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .smi_allowed = svm_smi_allowed, .pre_enter_smm = svm_pre_enter_smm, .pre_leave_smm = svm_pre_leave_smm, - .enable_smi_window = enable_smi_window, + .enable_smi_window = svm_enable_smi_window, .mem_enc_op = svm_mem_enc_op, .mem_enc_reg_region = svm_register_enc_region, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6e7d070f8b86..39e071fdab0c 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -23,22 +23,8 @@ #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) -static const struct svm_host_save_msrs { - u32 index; /* Index of the MSR */ - bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */ -} host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - { .index = MSR_STAR, .sev_es_restored = true }, - { .index = MSR_LSTAR, .sev_es_restored = true }, - { .index = MSR_CSTAR, .sev_es_restored = true }, - { .index = MSR_SYSCALL_MASK, .sev_es_restored = true }, - { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true }, - { .index = MSR_FS_BASE, .sev_es_restored = true }, -#endif - { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true }, - { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true }, - { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true }, - { .index = MSR_TSC_AUX, .sev_es_restored = false }, +static const u32 host_save_user_msrs[] = { + MSR_TSC_AUX, }; #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) @@ -130,12 +116,6 @@ struct vcpu_svm { u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; u64 spec_ctrl; /* @@ -192,6 +172,8 @@ struct vcpu_svm { u64 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; + + bool guest_state_loaded; }; struct svm_cpu_data { @@ -587,9 +569,8 @@ int sev_handle_vmgexit(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm); -void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu); -void sev_es_vcpu_put(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); /* vmenter.S */ diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h new file mode 100644 index 000000000000..8170f2a5a16f --- /dev/null +++ b/arch/x86/kvm/svm/svm_ops.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SVM_OPS_H +#define __KVM_X86_SVM_OPS_H + +#include <linux/compiler_types.h> + +#include <asm/kvm_host.h> + +#define svm_asm(insn, clobber...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) "\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + ::: clobber : fault); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define svm_asm1(insn, op1, clobber...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + :: op1 : clobber : fault); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define svm_asm2(insn, op1, op2, clobber...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + :: op1, op2 : clobber : fault); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +static inline void clgi(void) +{ + svm_asm(clgi); +} + +static inline void stgi(void) +{ + svm_asm(stgi); +} + +static inline void invlpga(unsigned long addr, u32 asid) +{ + svm_asm2(invlpga, "c"(asid), "a"(addr)); +} + +/* + * Despite being a physical address, the portion of rAX that is consumed by + * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, + * hence 'unsigned long' instead of 'hpa_t'. + */ +static inline void vmsave(unsigned long pa) +{ + svm_asm1(vmsave, "a" (pa), "memory"); +} + +static inline void vmload(unsigned long pa) +{ + svm_asm1(vmload, "a" (pa), "memory"); +} + +#endif /* __KVM_X86_SVM_OPS_H */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 2de30c20bc26..a61c015870e3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -93,6 +93,42 @@ TRACE_EVENT(kvm_hv_hypercall, ); /* + * Tracepoint for Xen hypercall. + */ +TRACE_EVENT(kvm_xen_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, unsigned long a4, + unsigned long a5), + TP_ARGS(nr, a0, a1, a2, a3, a4, a5), + + TP_STRUCT__entry( + __field(unsigned long, nr) + __field(unsigned long, a0) + __field(unsigned long, a1) + __field(unsigned long, a2) + __field(unsigned long, a3) + __field(unsigned long, a4) + __field(unsigned long, a5) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + __entry->a4 = a4; + __entry->a4 = a5; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3, __entry->a4, __entry->a5) +); + + + +/* * Tracepoint for PIO. */ @@ -256,7 +292,7 @@ TRACE_EVENT(name, \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \ + static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \ &__entry->info2, \ &__entry->intr_info, \ &__entry->error_code); \ @@ -738,7 +774,7 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS); + __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS); __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr - vcpu->arch.emulate_ctxt->fetch.data; __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 3a1861403d73..d1d77985e889 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -19,6 +19,9 @@ extern int __read_mostly pt_mode; #define PT_MODE_HOST_GUEST 1 #define PMU_CAP_FW_WRITES (1ULL << 13) +#define PMU_CAP_LBR_FMT 0x3f + +#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) struct nested_vmx_msrs { /* @@ -262,6 +265,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void) SECONDARY_EXEC_TSC_SCALING; } +static inline bool cpu_has_vmx_bus_lock_detection(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_BUS_LOCK_DETECTION; +} + static inline bool cpu_has_vmx_apicv(void) { return cpu_has_vmx_apic_register_virt() && @@ -371,11 +380,28 @@ static inline bool vmx_pt_mode_is_host_guest(void) static inline u64 vmx_get_perf_capabilities(void) { + u64 perf_cap = 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + + perf_cap &= PMU_CAP_LBR_FMT; + /* * Since counters are virtualized, KVM would support full * width counting unconditionally, even if the host lacks it. */ - return PMU_CAP_FW_WRITES; + return PMU_CAP_FW_WRITES | perf_cap; +} + +static inline u64 vmx_supported_debugctl(void) +{ + u64 debugctl = 0; + + if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) + debugctl |= DEBUGCTLMSR_LBR_MASK; + + return debugctl; } #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f2b9bfb58206..b2f0b5e9cd63 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -12,6 +12,7 @@ #include "nested.h" #include "pmu.h" #include "trace.h" +#include "vmx.h" #include "x86.h" static bool __read_mostly enable_shadow_vmcs = 1; @@ -411,8 +412,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit if (nr == DB_VECTOR) { if (!has_payload) { payload = vcpu->arch.dr6; - payload &= ~(DR6_FIXED_1 | DR6_BT); - payload ^= DR6_RTM; + payload &= ~DR6_BT; + payload ^= DR6_ACTIVE_LOW; } *exit_qual = payload; } else @@ -744,8 +745,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, (CC(!nested_cpu_has_vid(vmcs12)) || CC(!nested_exit_intr_ack_set(vcpu)) || CC((vmcs12->posted_intr_nv & 0xff00)) || - CC((vmcs12->posted_intr_desc_addr & 0x3f)) || - CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) + CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -758,13 +758,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { - int maxphyaddr; - if (count == 0) return 0; - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) + + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || + !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; return 0; @@ -1062,14 +1060,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, } } -static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long invalid_mask; - - invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - return (val & invalid_mask) == 0; -} - /* * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't @@ -1121,7 +1111,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, enum vm_entry_failure_code *entry_failure_code) { - if (CC(!nested_cr3_valid(vcpu, cr3))) { + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2532,7 +2522,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. */ - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); @@ -2635,7 +2625,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ switch (new_eptp & VMX_EPTP_MT_MASK) { @@ -2666,7 +2655,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ @@ -2850,7 +2839,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) + CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || @@ -3057,35 +3046,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - asm( - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" - "je 1f \n\t" - __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" - "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" - "1: \n\t" - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ - - /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" - - /* - * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set - * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail - * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the - * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. - */ - "call vmx_vmenter\n\t" - - CC_SET(be) - : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : [HOST_RSP]"r"((unsigned long)HOST_RSP), - [loaded_vmcs]"r"(vmx->loaded_vmcs), - [launched]"i"(offsetof(struct loaded_vmcs, launched)), - [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), - [wordsize]"i"(sizeof(ulong)) - : "memory" - ); + vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); if (vmx->msr_autoload.host.nr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -3330,7 +3292,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; bool evaluate_pending_interrupts; - u32 exit_reason, failed_index; + union vmx_exit_reason exit_reason = { + .basic = EXIT_REASON_INVALID_STATE, + .failed_vmentry = 1, + }; + u32 failed_index; if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); @@ -3382,7 +3348,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (nested_vmx_check_guest_state(vcpu, vmcs12, &entry_failure_code)) { - exit_reason = EXIT_REASON_INVALID_STATE; + exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; } @@ -3393,7 +3359,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { - exit_reason = EXIT_REASON_INVALID_STATE; + exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; } @@ -3403,7 +3369,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); if (failed_index) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; } @@ -3471,7 +3437,7 @@ vmentry_fail_vmexit: return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->vm_exit_reason = exit_reason.full; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; @@ -5559,7 +5525,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); fail: - nested_vmx_vmexit(vcpu, vmx->exit_reason, + /* + * This is effectively a reflected VM-Exit, as opposed to a synthesized + * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode + * EXIT_REASON_VMFUNC as the exit reason. + */ + nested_vmx_vmexit(vcpu, vmx->exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -5627,7 +5598,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. */ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) + struct vmcs12 *vmcs12, + union vmx_exit_reason exit_reason) { u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; @@ -5641,7 +5613,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -5778,11 +5750,12 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) * Return true if L0 wants to handle an exit from L2 regardless of whether or not * L1 wants the exit. Only call this when in is_guest_mode (L2). */ -static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) { u32 intr_info; - switch ((u16)exit_reason) { + switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) @@ -5838,12 +5811,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) * Return 1 if L1 wants to intercept an exit from L2. Only call this when in * is_guest_mode (L2). */ -static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 intr_info; - switch ((u16)exit_reason) { + switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) @@ -5962,7 +5936,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->exit_reason; unsigned long exit_qual; u32 exit_intr_info; @@ -5981,7 +5955,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); + trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -6007,7 +5981,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); + nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); return true; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index cdf5f34518f4..d1df618cb7de 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } -static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) { if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return false; + return 0; - return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES; + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; } static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) @@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } +bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + /* + * As a first step, a guest could only enable LBR feature if its + * cpu model is the same as the host because the LBR registers + * would be pass-through to the guest and they're model specific. + */ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + + return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT); +} + +static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) +{ + struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); + bool ret = false; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return ret; + + ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) || + (index >= records->from && index < records->from + records->nr) || + (index >= records->to && index < records->to + records->nr); + + if (!ret && records->info) + ret = (index >= records->info && index < records->info + records->nr); + + return ret; +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr); + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || + intel_pmu_is_valid_lbr_msr(vcpu, msr); break; } @@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->event) { + perf_event_release_kernel(lbr_desc->event); + lbr_desc->event = NULL; + vcpu_to_pmu(vcpu)->event_count--; + } +} + +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct perf_event *event; + + /* + * The perf_event_attr is constructed in the minimum efficient way: + * - set 'pinned = true' to make it task pinned so that if another + * cpu pinned event reclaims LBR, the event->oncpu will be set to -1; + * - set '.exclude_host = true' to record guest branches behavior; + * + * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicates host perf + * schedule the event without a real HW counter but a fake one; + * check is_guest_lbr_event() and __intel_get_event_constraints(); + * + * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and + * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack + * event, which helps KVM to save/restore guest LBR records + * during host context switches and reduces quite a lot overhead, + * check branch_user_callstack() and intel_pmu_lbr_sched_task(); + */ + struct perf_event_attr attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(attr), + .config = INTEL_FIXED_VLBR_EVENT, + .sample_type = PERF_SAMPLE_BRANCH_STACK, + .pinned = true, + .exclude_host = true, + .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_USER, + }; + + if (unlikely(lbr_desc->event)) { + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + return 0; + } + + event = perf_event_create_kernel_counter(&attr, -1, + current, NULL, NULL); + if (IS_ERR(event)) { + pr_debug_ratelimited("%s: failed %ld\n", + __func__, PTR_ERR(event)); + return -ENOENT; + } + lbr_desc->event = event; + pmu->event_count++; + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + return 0; +} + +/* + * It's safe to access LBR msrs from guest when they have not + * been passthrough since the host would help restore or reset + * the LBR msrs records when the guest LBR event is scheduled in. + */ +static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, bool read) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + u32 index = msr_info->index; + + if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) + return false; + + if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu)) + goto dummy; + + /* + * Disable irq to ensure the LBR feature doesn't get reclaimed by the + * host at the time the value is read from the msr, and this avoids the + * host LBR value to be leaked to the guest. If LBR has been reclaimed, + * return 0 on guest reads. + */ + local_irq_disable(); + if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { + if (read) + rdmsrl(index, msr_info->data); + else + wrmsrl(index, msr_info->data); + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); + local_irq_enable(); + return true; + } + clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); + local_irq_enable(); + +dummy: + if (read) + msr_info->data = 0; + return true; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; return 0; - } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) + return 0; } return 1; @@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reprogram_gp_counter(pmc, data); return 0; } - } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) + return 0; } return 1; @@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; @@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; - vcpu->arch.perf_capabilities = 0; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -340,8 +489,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; perf_get_x86_pmu_capability(&x86_pmu); - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); @@ -385,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); nested_vmx_pmu_entry_exit_ctls_update(vcpu); + + if (intel_pmu_lbr_is_compatible(vcpu)) + x86_perf_get_lbr(&lbr_desc->records); + else + lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); } static void intel_pmu_init(struct kvm_vcpu *vcpu) { int i; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; @@ -405,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; pmu->fixed_counters[i].current_config = 0; } + + vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); + lbr_desc->records.nr = 0; + lbr_desc->event = NULL; + lbr_desc->msr_passthrough = false; } static void intel_pmu_reset(struct kvm_vcpu *vcpu) @@ -429,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = pmu->global_ovf_ctrl = 0; + + intel_pmu_release_guest_lbr_event(vcpu); +} + +/* + * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4. + * + * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and + * the KVM emulates to clear the LBR bit (bit 0) in IA32_DEBUGCTL. + * + * Guest needs to re-enable LBR to resume branches recording. + */ +static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) +{ + u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + } +} + +static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu) +{ + u8 version = vcpu_to_pmu(vcpu)->version; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return; + + if (version > 1 && version < 4) + intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu); +} + +static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + int i; + + for (i = 0; i < lbr->nr; i++) { + vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set); + if (lbr->info) + vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set); + } + + vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set); +} + +static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->msr_passthrough) + return; + + vmx_update_intercept_for_lbr_msrs(vcpu, true); + lbr_desc->msr_passthrough = false; +} + +static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->msr_passthrough) + return; + + vmx_update_intercept_for_lbr_msrs(vcpu, false); + lbr_desc->msr_passthrough = true; +} + +/* + * Higher priority host perf events (e.g. cpu pinned) could reclaim the + * pmu resources (e.g. LBR) that were assigned to the guest. This is + * usually done via ipi calls (more details in perf_install_in_context). + * + * Before entering the non-root mode (with irq disabled here), double + * confirm that the pmu features enabled to the guest are not reclaimed + * by higher priority host events. Otherwise, disallow vcpu's access to + * the reclaimed features. + */ +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); + if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) + goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; + return; + } + + if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) { + vmx_disable_lbr_msrs_passthrough(vcpu); + __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + goto warn; + } else + vmx_enable_lbr_msrs_passthrough(vcpu); + + return; + +warn: + pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", + vcpu->vcpu_id); +} + +static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) + intel_pmu_release_guest_lbr_event(vcpu); } struct kvm_pmu_ops intel_pmu_ops = { @@ -445,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = { .refresh = intel_pmu_refresh, .init = intel_pmu_init, .reset = intel_pmu_reset, + .deliver_pmi = intel_pmu_deliver_pmi, + .cleanup = intel_pmu_cleanup, }; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index f02962dcc72c..4831bc44ce66 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) dest = cpu_physical_id(cpu); - if (x2apic_enabled()) + if (x2apic_mode) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -104,7 +104,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) dest = cpu_physical_id(vcpu->cpu); - if (x2apic_enabled()) + if (x2apic_mode) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -174,7 +174,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) */ dest = cpu_physical_id(vcpu->pre_pcpu); - if (x2apic_enabled()) + if (x2apic_mode) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e85aa5faa22d..3a6461694fc2 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -44,7 +44,7 @@ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump * to vmx_vmexit. */ -SYM_FUNC_START(vmx_vmenter) +SYM_FUNC_START_LOCAL(vmx_vmenter) /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ je 2f diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index eb69fef57485..e0a3a9be654b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -50,6 +50,7 @@ #include "capabilities.h" #include "cpuid.h" #include "evmcs.h" +#include "hyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" @@ -552,7 +553,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; struct hv_partition_assist_pg **p_hv_pa_pg = - &vcpu->kvm->arch.hyperv.hv_pa_pg; + &to_kvm_hv(vcpu->kvm)->hv_pa_pg; /* * Synthetic VM-Exit is not enabled in current code and so All * evmcs in singe VM shares same assist page. @@ -658,6 +659,14 @@ static bool is_valid_passthrough_msr(u32 msr) case MSR_IA32_RTIT_CR3_MATCH: case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + case MSR_LBR_SELECT: + case MSR_LBR_TOS: + case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: + case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: + case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: + case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: + case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: + /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ return true; } @@ -806,7 +815,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) return *p; } -void update_exception_bitmap(struct kvm_vcpu *vcpu) +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -1102,7 +1111,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ - return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); + return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) @@ -1577,7 +1586,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { orig_rip = kvm_rip_read(vcpu); rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); #ifdef CONFIG_X86_64 @@ -1924,6 +1933,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; goto find_uret_msr; + case MSR_IA32_DEBUGCTLMSR: + msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_info->index); @@ -1947,6 +1959,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } +static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) +{ + u64 debugctl = vmx_supported_debugctl(); + + if (!intel_pmu_lbr_is_enabled(vcpu)) + debugctl &= ~DEBUGCTLMSR_LBR_MASK; + + return debugctl; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -1997,14 +2019,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_DEBUGCTLMSR: { + u64 invalid = data & ~vcpu_supported_debugctl(vcpu); + if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + } + + if (invalid) + return 1; + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; - ret = kvm_set_msr_common(vcpu, msr_info); - break; - + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); + return 0; + } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -2196,6 +2233,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data >> 32) != 0) return 1; goto find_uret_msr; + case MSR_IA32_PERF_CAPABILITIES: + if (data && !vcpu_to_pmu(vcpu)->version) + return 1; + if (data & PMU_CAP_LBR_FMT) { + if ((data & PMU_CAP_LBR_FMT) != + (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) + return 1; + if (!intel_pmu_lbr_is_compatible(vcpu)) + return 1; + } + ret = kvm_set_msr_common(vcpu, msr_info); + break; default: find_uret_msr: @@ -2265,7 +2314,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) u64 msr; cr4_set_bits(X86_CR4_VMXE); - intel_pt_handle_vmx(1); asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) @@ -2276,7 +2324,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - intel_pt_handle_vmx(0); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; @@ -2299,9 +2346,13 @@ static int hardware_enable(void) !hv_get_vp_assist_page(cpu)) return -EFAULT; + intel_pt_handle_vmx(1); + r = kvm_cpu_vmxon(phys_addr); - if (r) + if (r) { + intel_pt_handle_vmx(0); return r; + } if (enable_ept) ept_sync_global(); @@ -2319,22 +2370,14 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } - -/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() - * tricks. - */ -static void kvm_cpu_vmxoff(void) -{ - asm volatile (__ex("vmxoff")); - - intel_pt_handle_vmx(0); - cr4_clear_bits(X86_CR4_VMXE); -} - static void hardware_disable(void) { vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + intel_pt_handle_vmx(0); } /* @@ -2428,7 +2471,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC; + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_BUS_LOCK_DETECTION; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2739,7 +2783,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); @@ -2819,7 +2863,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); @@ -3774,7 +3818,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { if (value) @@ -4269,6 +4313,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, ENABLE_USR_WAIT_PAUSE, false); + if (!vcpu->kvm->arch.bus_lock_detection_enabled) + exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + vmx->secondary_exec_control = exec_control; } @@ -4467,23 +4514,23 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); if (init_event) vmx_clear_hlt(vcpu); } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -static void enable_nmi_window(struct kvm_vcpu *vcpu) +static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); + vmx_enable_irq_window(vcpu); return; } @@ -4824,7 +4871,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); fallthrough; case BP_VECTOR: @@ -5049,6 +5096,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int dr, dr7, reg; + int err = 1; exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; @@ -5057,9 +5105,9 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!kvm_require_dr(vcpu, dr)) return 1; - /* Do not handle if the CPL > 0, will trigger GP on re-entry */ - if (!kvm_require_cpl(vcpu, 0)) - return 1; + if (kvm_x86_ops.get_cpl(vcpu) > 0) + goto out; + dr7 = vmcs_readl(GUEST_DR7); if (dr7 & DR7_GD) { /* @@ -5068,7 +5116,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1; + vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; @@ -5096,14 +5144,15 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; - if (kvm_get_dr(vcpu, dr, &val)) - return 1; + kvm_get_dr(vcpu, dr, &val); kvm_register_write(vcpu, reg, val); - } else - if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) - return 1; + err = 0; + } else { + err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)); + } - return kvm_skip_emulated_instruction(vcpu); +out: + return kvm_complete_insn_gp(vcpu, err); } static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) @@ -5177,9 +5226,8 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u64 new_bv = kvm_read_edx_eax(vcpu); u32 index = kvm_rcx_read(vcpu); - if (kvm_set_xcr(vcpu, index, new_bv) == 0) - return kvm_skip_emulated_instruction(vcpu); - return 1; + int err = kvm_set_xcr(vcpu, index, new_bv); + return kvm_complete_insn_gp(vcpu, err); } static int handle_apic_access(struct kvm_vcpu *vcpu) @@ -5600,6 +5648,13 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + return 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5656,6 +5711,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -5667,7 +5723,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, struct vcpu_vmx *vmx = to_vmx(vcpu); *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + if (!(vmx->exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -5908,11 +5964,12 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + u16 exit_handler_index; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more @@ -5954,11 +6011,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 1; } - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + if (exit_reason.failed_vmentry) { dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; + = exit_reason.full; vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -5980,18 +6037,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * will cause infinite loop. */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_PML_FULL && - exit_reason != EXIT_REASON_APIC_ACCESS && - exit_reason != EXIT_REASON_TASK_SWITCH)) { + (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && + exit_reason.basic != EXIT_REASON_EPT_VIOLATION && + exit_reason.basic != EXIT_REASON_PML_FULL && + exit_reason.basic != EXIT_REASON_APIC_ACCESS && + exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; - if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.ndata++; vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -6023,42 +6080,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - if (exit_reason >= kvm_vmx_max_exit_handlers) + if (exit_reason.basic >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; #ifdef CONFIG_RETPOLINE - if (exit_reason == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); - else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) + else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) return handle_interrupt_window(vcpu); - else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) return handle_external_interrupt(vcpu); - else if (exit_reason == EXIT_REASON_HLT) + else if (exit_reason.basic == EXIT_REASON_HLT) return kvm_emulate_halt(vcpu); - else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) return handle_ept_misconfig(vcpu); #endif - exit_reason = array_index_nospec(exit_reason, - kvm_vmx_max_exit_handlers); - if (!kvm_vmx_exit_handlers[exit_reason]) + exit_handler_index = array_index_nospec((u16)exit_reason.basic, + kvm_vmx_max_exit_handlers); + if (!kvm_vmx_exit_handlers[exit_handler_index]) goto unexpected_vmexit; - return kvm_vmx_exit_handlers[exit_reason](vcpu); + return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason.full); dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[0] = exit_reason.full; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } +static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +{ + int ret = __vmx_handle_exit(vcpu, exit_fastpath); + + /* + * Even when current exit reason is handled by KVM internally, we + * still need to exit to user space when bus lock detected to inform + * that there is a bus lock in guest. + */ + if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (ret > 0) + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + return 0; + } + return ret; +} + /* * Software based L1D cache flush which is used when microcode providing * the cache control MSR is not loaded. @@ -6129,7 +6206,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) : "eax", "ebx", "ecx", "edx"); } -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; @@ -6373,9 +6450,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); - else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); } @@ -6567,7 +6644,7 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - switch (to_vmx(vcpu)->exit_reason) { + switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -6577,8 +6654,6 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } } -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); - static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { @@ -6638,11 +6713,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; -reenter_guest: /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) @@ -6696,6 +6769,8 @@ reenter_guest: pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); + if (intel_pmu_lbr_is_enabled(vcpu)) + vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); @@ -6734,12 +6809,12 @@ reenter_guest: x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); /* All fields are clean at this point */ - if (static_branch_unlikely(&enable_evmcs)) + if (static_branch_unlikely(&enable_evmcs)) { current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - if (static_branch_unlikely(&enable_evmcs)) - current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index; + current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); + } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) @@ -6768,17 +6843,17 @@ reenter_guest: vmx->idt_vectoring_info = 0; if (unlikely(vmx->fail)) { - vmx->exit_reason = 0xdead; + vmx->exit_reason.full = 0xdead; return EXIT_FASTPATH_NONE; } - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)) + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -6790,22 +6865,7 @@ reenter_guest: if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - exit_fastpath = vmx_exit_handlers_fastpath(vcpu); - if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) { - if (!kvm_vcpu_exit_request(vcpu)) { - /* - * FIXME: this goto should be a loop in vcpu_enter_guest, - * but it would incur the cost of a retpoline for now. - * Revisit once static calls are available. - */ - if (vcpu->arch.apicv_active) - vmx_sync_pir_to_irr(vcpu); - goto reenter_guest; - } - exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; - } - - return exit_fastpath; + return vmx_exit_handlers_fastpath(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -7256,7 +7316,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_cr4_guest_host_mask(vmx); /* Refresh #PF interception to account for MAXPHYADDR changes. */ - update_exception_bitmap(vcpu); + vmx_update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) @@ -7546,7 +7606,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 0; } -static void enable_smi_window(struct kvm_vcpu *vcpu) +static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } @@ -7606,7 +7666,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_exception_bitmap = update_exception_bitmap, + .update_exception_bitmap = vmx_update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, @@ -7649,9 +7709,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, + .enable_nmi_window = vmx_enable_nmi_window, + .enable_irq_window = vmx_enable_irq_window, + .update_cr8_intercept = vmx_update_cr8_intercept, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, @@ -7709,7 +7769,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, - .enable_smi_window = enable_smi_window, + .enable_smi_window = vmx_enable_smi_window, .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, @@ -7810,6 +7870,8 @@ static __init int hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 48; } + kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9d3a557949ac..12c53d05a902 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -70,6 +70,54 @@ struct pt_desc { struct pt_ctx guest; }; +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) +#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) + +bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); + +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -244,7 +292,7 @@ struct vcpu_vmx { int vpid; bool emulation_required; - u32 exit_reason; + union vmx_exit_reason exit_reason; /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -279,6 +327,7 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ #define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 @@ -329,7 +378,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level); -void update_exception_bitmap(struct kvm_vcpu *vcpu); +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); @@ -339,8 +388,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value); static inline u8 vmx_get_rvi(void) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b967c1c774a1..884e5b3838c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -29,6 +29,7 @@ #include "pmu.h" #include "hyperv.h" #include "lapic.h" +#include "xen.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -114,11 +115,21 @@ static int sync_regs(struct kvm_vcpu *vcpu); struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); +#define KVM_X86_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ + *(((struct kvm_x86_ops *)0)->func)); +#define KVM_X86_OP_NULL KVM_X86_OP +#include <asm/kvm-x86-ops.h> +EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); +EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); +EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); + static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); -static bool __read_mostly report_ignored_msrs = true; +bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +EXPORT_SYMBOL_GPL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -136,6 +147,8 @@ u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); u64 __read_mostly kvm_default_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); +bool __read_mostly kvm_has_bus_lock_exit; +EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; @@ -234,7 +247,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), - VM_STAT("mmu_pte_updated", mmu_pte_updated), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), VM_STAT("mmu_flooded", mmu_flooded), VM_STAT("mmu_recycled", mmu_recycled), @@ -395,7 +407,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) @@ -484,19 +496,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) */ vcpu->arch.dr6 &= ~DR_TRAP_BITS; /* - * DR6.RTM is set by all #DB exceptions that don't clear it. + * In order to reflect the #DB exception payload in guest + * dr6, three components need to be considered: active low + * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, + * DR6_BS and DR6_BT) + * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. + * In the target guest dr6: + * FIXED_1 bits should always be set. + * Active low bits should be cleared if 1-setting in payload. + * Active high bits should be set if 1-setting in payload. + * + * Note, the payload is compatible with the pending debug + * exceptions/exit qualification under VMX, that active_low bits + * are active high in payload. + * So they need to be flipped for DR6. */ - vcpu->arch.dr6 |= DR6_RTM; + vcpu->arch.dr6 |= DR6_ACTIVE_LOW; vcpu->arch.dr6 |= payload; - /* - * Bit 16 should be set in the payload whenever the #DB - * exception should clear DR6.RTM. This makes the payload - * compatible with the pending debug exceptions under VMX. - * Though not currently documented in the SDM, this also - * makes the payload compatible with the exit qualification - * for #DB exceptions under VMX. - */ - vcpu->arch.dr6 ^= payload & DR6_RTM; + vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending @@ -692,7 +709,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) + if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -742,8 +759,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { - return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | - rsvd_bits(1, 2); + return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); } /* @@ -852,7 +868,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } @@ -865,7 +881,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; - kvm_x86_ops.set_cr0(vcpu, cr0); + static_call(kvm_x86_set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); @@ -970,12 +986,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (kvm_x86_ops.get_cpl(vcpu) != 0 || - __kvm_set_xcr(vcpu, index, xcr)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; + if (static_call(kvm_x86_get_cpl)(vcpu) == 0) + return __kvm_set_xcr(vcpu, index, xcr); + + return 1; } EXPORT_SYMBOL_GPL(kvm_set_xcr); @@ -987,7 +1001,7 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return kvm_x86_ops.is_valid_cr4(vcpu, cr4); + return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); @@ -1031,7 +1045,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - kvm_x86_ops.set_cr4(vcpu, cr4); + static_call(kvm_x86_set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); @@ -1059,8 +1073,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && - (cr3 & vcpu->arch.cr3_lm_rsvd_bits)) + if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; else if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) @@ -1114,7 +1127,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops.set_dr7(vcpu, dr7); + static_call(kvm_x86_set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1130,7 +1143,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) return fixed; } -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { size_t size = ARRAY_SIZE(vcpu->arch.db); @@ -1143,13 +1156,13 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 4: case 6: if (!kvm_dr6_valid(val)) - return -1; /* #GP */ + return 1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; case 5: default: /* 7 */ if (!kvm_dr7_valid(val)) - return -1; /* #GP */ + return 1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); break; @@ -1157,18 +1170,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } - -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - if (__kvm_set_dr(vcpu, dr, val)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; -} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { size_t size = ARRAY_SIZE(vcpu->arch.db); @@ -1185,7 +1189,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) *val = vcpu->arch.dr7; break; } - return 0; } EXPORT_SYMBOL_GPL(kvm_get_dr); @@ -1426,7 +1429,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - return kvm_x86_ops.get_msr_feature(msr); + return static_call(kvm_x86_get_msr_feature)(msr); } return 0; } @@ -1502,7 +1505,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - r = kvm_x86_ops.set_efer(vcpu, efer); + r = static_call(kvm_x86_set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; @@ -1599,7 +1602,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return kvm_x86_ops.set_msr(vcpu, &msr); + return static_call(kvm_x86_set_msr)(vcpu, &msr); } static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, @@ -1632,7 +1635,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = kvm_x86_ops.get_msr(vcpu, &msr); + ret = static_call(kvm_x86_get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -1673,12 +1676,12 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } - return kvm_x86_ops.complete_emulated_msr(vcpu, err); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); } static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); } static u64 kvm_msr_reason(int r) @@ -1750,7 +1753,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_read_ex(ecx); } - return kvm_x86_ops.complete_emulated_msr(vcpu, r); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); @@ -1776,17 +1779,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) else trace_kvm_msr_write_ex(ecx, data); - return kvm_x86_ops.complete_emulated_msr(vcpu, r); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); -bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { xfer_to_guest_mode_prepare(); return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); /* * The fast path for frequent and performance sensitive wrmsr emulation, @@ -1936,15 +1938,14 @@ static s64 get_kvmclock_base_ns(void) } #endif -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; struct pvclock_wall_clock wc; + u32 wc_sec_hi; u64 wall_nsec; - kvm->arch.wall_clock = wall_clock; - if (!wall_clock) return; @@ -1973,6 +1974,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + if (sec_hi_ofs) { + wc_sec_hi = wall_nsec >> 32; + kvm_write_guest(kvm, wall_clock + sec_hi_ofs, + &wc_sec_hi, sizeof(wc_sec_hi)); + } + version++; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } @@ -2209,7 +2216,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vcpu->arch.l1_tsc_offset = offset; - vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -2592,13 +2599,15 @@ u64 get_kvmclock_ns(struct kvm *kvm) return ret; } -static void kvm_setup_pvclock_page(struct kvm_vcpu *v) +static void kvm_setup_pvclock_page(struct kvm_vcpu *v, + struct gfn_to_hva_cache *cache, + unsigned int offset) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) + if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, + &guest_hv_clock, offset, sizeof(guest_hv_clock)))) return; /* This VCPU is paused, but it's legal for a guest to read another @@ -2621,9 +2630,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) ++guest_hv_clock.version; /* first time write, random junk */ vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_offset_cached(v->kvm, cache, + &vcpu->hv_clock, offset, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -2637,16 +2646,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_offset_cached(v->kvm, cache, + &vcpu->hv_clock, offset, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_offset_cached(v->kvm, cache, + &vcpu->hv_clock, offset, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2733,7 +2742,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v); + kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); + if (vcpu->xen.vcpu_info_set) + kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (vcpu->xen.vcpu_time_info_set) + kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); if (v == kvm_get_vcpu(v->kvm, 0)) kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; @@ -2858,32 +2872,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 - : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; - u32 page_num = data & ~PAGE_MASK; - u64 page_addr = data & PAGE_MASK; - u8 *page; - - if (page_num >= blob_size) - return 1; - - page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) - return PTR_ERR(page); - - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { - kfree(page); - return 1; - } - return 0; -} - static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; @@ -2955,13 +2943,13 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush_all(vcpu); + static_call(kvm_x86_tlb_flush_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush_guest(vcpu); + static_call(kvm_x86_tlb_flush_guest)(vcpu); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -3017,6 +3005,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u32 msr = msr_info->index; u64 data = msr_info->data; + if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + return kvm_xen_write_hypercall_page(vcpu, data); + switch (msr) { case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_WRITE: @@ -3073,18 +3064,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* Values other than LBR and BTF are vendor-specific, - thus reserved and should throw a #GP */ - return 1; - } else if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: @@ -3153,13 +3132,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; - kvm_write_wall_clock(vcpu->kvm, data); + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; - kvm_write_wall_clock(vcpu->kvm, data); + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) @@ -3304,8 +3285,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_misc_features_enables = data; break; default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; @@ -3357,7 +3336,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: @@ -3739,7 +3717,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: - case KVM_CAP_XEN_HVM: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: @@ -3779,6 +3756,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; + case KVM_CAP_XEN_HVM: + r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | + KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | + KVM_XEN_HVM_CONFIG_SHARED_INFO; + break; case KVM_CAP_SYNC_REGS: r = KVM_SYNC_X86_VALID_FIELDS; break; @@ -3800,10 +3782,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE); + r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: - r = !kvm_x86_ops.cpu_has_accelerated_tpr(); + r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; @@ -3845,6 +3827,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_STEAL_TIME: r = sched_info_on(); break; + case KVM_CAP_X86_BUS_LOCK_EXIT: + if (kvm_has_bus_lock_exit) + r = KVM_BUS_LOCK_DETECTION_OFF | + KVM_BUS_LOCK_DETECTION_EXIT; + else + r = 0; + break; default: break; } @@ -3962,14 +3951,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops.has_wbinvd_exit()) + if (static_call(kvm_x86_has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops.vcpu_load(vcpu, cpu); + static_call(kvm_x86_vcpu_load)(vcpu, cpu); /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -4015,6 +4004,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; + int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4022,9 +4012,15 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - return; + goto out; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -4032,33 +4028,18 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + +out: + srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - int idx; - if (vcpu->preempted && !vcpu->arch.guest_state_protected) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); + vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); - /* - * Disable page faults because we're in atomic context here. - * kvm_write_guest_offset_cached() would call might_fault() - * that relies on pagefault_disable() to tell if there's a - * bug. NOTE: the write to guest memory may not go through if - * during postcopy live migration or if there's heavy guest - * paging. - */ - pagefault_disable(); - /* - * kvm_memslots() will be called by - * kvm_write_guest_offset_cached() so take the srcu lock. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - pagefault_enable(); - kvm_x86_ops.vcpu_put(vcpu); + static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* * If userspace has set any breakpoints or watchpoints, dr6 is restored @@ -4072,7 +4053,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { if (vcpu->arch.apicv_active) - kvm_x86_ops.sync_pir_to_irr(vcpu); + static_call(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -4182,7 +4163,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - kvm_x86_ops.setup_mce(vcpu); + static_call(kvm_x86_setup_mce)(vcpu); out: return r; } @@ -4289,11 +4270,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); + events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu); + events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -4360,13 +4341,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops.set_interrupt_shadow(vcpu, - events->interrupt.shadow); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked); + static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -4422,9 +4403,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - if (dbgregs->dr6 & ~0xffffffffull) + if (!kvm_dr6_valid(dbgregs->dr6)) return -EINVAL; - if (dbgregs->dr7 & ~0xffffffffull) + if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); @@ -4661,7 +4642,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; - return kvm_x86_ops.enable_direct_tlbflush(vcpu); + return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; @@ -5032,6 +5013,26 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); break; + case KVM_XEN_VCPU_GET_ATTR: { + struct kvm_xen_vcpu_attr xva; + + r = -EFAULT; + if (copy_from_user(&xva, argp, sizeof(xva))) + goto out; + r = kvm_xen_vcpu_get_attr(vcpu, &xva); + if (!r && copy_to_user(argp, &xva, sizeof(xva))) + r = -EFAULT; + break; + } + case KVM_XEN_VCPU_SET_ATTR: { + struct kvm_xen_vcpu_attr xva; + + r = -EFAULT; + if (copy_from_user(&xva, argp, sizeof(xva))) + goto out; + r = kvm_xen_vcpu_set_attr(vcpu, &xva); + break; + } default: r = -EINVAL; } @@ -5053,14 +5054,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops.set_tss_addr(kvm, addr); + ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr); + return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -5217,8 +5218,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. */ - if (kvm_x86_ops.flush_log_dirty) - kvm_x86_ops.flush_log_dirty(kvm); + static_call_cond(kvm_x86_flush_log_dirty)(kvm); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -5308,6 +5308,20 @@ split_irqchip_unlock: kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; + case KVM_CAP_X86_BUS_LOCK_EXIT: + r = -EINVAL; + if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) + break; + + if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && + (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) + break; + + if (kvm_has_bus_lock_exit && + cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) + kvm->arch.bus_lock_detection_enabled = true; + r = 0; + break; default: r = -EINVAL; break; @@ -5637,11 +5651,27 @@ set_pit2_out: r = -EFAULT; if (copy_from_user(&xhc, argp, sizeof(xhc))) goto out; - r = -EINVAL; - if (xhc.flags) + r = kvm_xen_hvm_config(kvm, &xhc); + break; + } + case KVM_XEN_HVM_GET_ATTR: { + struct kvm_xen_hvm_attr xha; + + r = -EFAULT; + if (copy_from_user(&xha, argp, sizeof(xha))) goto out; - memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc)); - r = 0; + r = kvm_xen_hvm_get_attr(kvm, &xha); + if (!r && copy_to_user(argp, &xha, sizeof(xha))) + r = -EFAULT; + break; + } + case KVM_XEN_HVM_SET_ATTR: { + struct kvm_xen_hvm_attr xha; + + r = -EFAULT; + if (copy_from_user(&xha, argp, sizeof(xha))) + goto out; + r = kvm_xen_hvm_set_attr(kvm, &xha); break; } case KVM_SET_CLOCK: { @@ -5686,7 +5716,7 @@ set_pit2_out: case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) - r = kvm_x86_ops.mem_enc_op(kvm, argp); + r = static_call(kvm_x86_mem_enc_op)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -5698,7 +5728,7 @@ set_pit2_out: r = -ENOTTY; if (kvm_x86_ops.mem_enc_reg_region) - r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion); + r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -5710,7 +5740,7 @@ set_pit2_out: r = -ENOTTY; if (kvm_x86_ops.mem_enc_unreg_region) - r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion); + r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -5812,7 +5842,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i])) + if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -5875,13 +5905,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops.set_segment(vcpu, var, seg); + static_call(kvm_x86_set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops.get_segment(vcpu, var, seg); + static_call(kvm_x86_get_segment)(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -5901,14 +5931,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5916,7 +5946,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5965,7 +5995,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -5990,7 +6020,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -6011,7 +6041,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = 0; - if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) + if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -6064,7 +6094,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) + if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -6089,7 +6119,7 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; - if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0))) return 1; if (force_emulation_prefix && @@ -6123,7 +6153,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -6531,7 +6561,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_x86_ops.get_segment_base(vcpu, seg); + return static_call(kvm_x86_get_segment_base)(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -6544,7 +6574,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops.has_wbinvd_exit()) { + if (static_call(kvm_x86_has_wbinvd_exit)()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -6571,17 +6601,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) +static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest) { - return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); + return kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -6649,27 +6679,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt)); + return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -6811,7 +6841,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage, + return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, &ctxt->exception); } @@ -6849,7 +6879,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked); + static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -6865,7 +6895,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate); + return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate); } static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) @@ -6927,7 +6957,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); + u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -6938,7 +6968,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops.set_interrupt_shadow(vcpu, mask); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -6980,7 +7010,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); @@ -7041,7 +7071,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) { + if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -7101,9 +7131,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (vcpu->arch.mmu->direct_map) { unsigned int indirect_shadow_pages; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); if (indirect_shadow_pages) kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -7210,7 +7240,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW; kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -7222,10 +7252,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); int r; - r = kvm_x86_ops.skip_emulated_instruction(vcpu); + r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); if (unlikely(!r)) return 0; @@ -7254,7 +7284,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -7311,6 +7341,42 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +/* + * Decode to be emulated instruction. Return EMULATION_OK if success. + */ +int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, + void *insn, int insn_len) +{ + int r = EMULATION_OK; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since we do not set + * complete_userspace_io. This does not handle watchpoints yet, + * those would be handled in the emulate_ops. + */ + if (!(emulation_type & EMULTYPE_SKIP) && + kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; + + r = x86_decode_insn(ctxt, insn, insn_len); + + trace_kvm_emulate_insn_start(vcpu); + ++vcpu->stat.insn_emulation; + + return r; +} +EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); + int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { @@ -7319,7 +7385,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool writeback = true; bool write_fault_to_spt; - if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len))) return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -7330,32 +7396,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; - kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); - - /* - * We will reenter on the same instruction since - * we do not set complete_userspace_io. This does not - * handle watchpoints yet, those would be handled in - * the emulate_ops. - */ - if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_breakpoint(vcpu, &r)) - return r; + kvm_clear_exception_queue(vcpu); - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); - - trace_kvm_emulate_insn_start(vcpu); - ++vcpu->stat.insn_emulation; + r = x86_decode_emulated_instruction(vcpu, emulation_type, + insn, insn_len); if (r != EMULATION_OK) { if ((emulation_type & EMULTYPE_TRAP_UD) || (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { @@ -7462,7 +7508,7 @@ restart: r = 1; if (writeback) { - unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || @@ -7471,7 +7517,7 @@ restart: if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); if (kvm_x86_ops.update_emulated_instruction) - kvm_x86_ops.update_emulated_instruction(vcpu); + static_call(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -7800,7 +7846,7 @@ static int kvm_is_user_mode(void) int user_mode = 3; if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu)); + user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -7945,7 +7991,6 @@ int kvm_arch_init(void *opaque) supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } - kvm_lapic_init(); if (pi_inject_timer == -1) pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); #ifdef CONFIG_X86_64 @@ -7987,6 +8032,7 @@ void kvm_arch_exit(void) kvm_mmu_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_fpu_cache); + WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); } static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) @@ -8038,7 +8084,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) return -KVM_EOPNOTSUPP; - if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + if (!kvm_get_walltime_and_clockread(&ts, &cycle)) return -KVM_EOPNOTSUPP; clock_pairing.sec = ts.tv_sec; @@ -8114,7 +8160,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit; - if (kvm_hv_hypercall_enabled(vcpu->kvm)) + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); nr = kvm_rax_read(vcpu); @@ -8134,7 +8183,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (kvm_x86_ops.get_cpl(vcpu) != 0) { + if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { ret = -KVM_EPERM; goto out; } @@ -8191,7 +8240,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - kvm_x86_ops.patch_hypercall(vcpu, instruction); + static_call(kvm_x86_patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -8215,12 +8264,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->if_flag = !vcpu->arch.guest_state_protected && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + + if (is_smm(vcpu)) + kvm_run->flags |= KVM_RUN_X86_SMM; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -8246,7 +8297,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); + static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); } static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) @@ -8257,7 +8308,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit /* try to reinject previous events if any */ if (vcpu->arch.exception.injected) { - kvm_x86_ops.queue_exception(vcpu); + static_call(kvm_x86_queue_exception)(vcpu); can_inject = false; } /* @@ -8276,10 +8327,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) { - kvm_x86_ops.set_nmi(vcpu); + static_call(kvm_x86_set_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - kvm_x86_ops.set_irq(vcpu); + static_call(kvm_x86_set_irq)(vcpu); can_inject = false; } } @@ -8320,7 +8371,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit } } - kvm_x86_ops.queue_exception(vcpu); + static_call(kvm_x86_queue_exception)(vcpu); can_inject = false; } @@ -8336,7 +8387,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit * The kvm_x86_ops hooks communicate this by returning -EBUSY. */ if (vcpu->arch.smi_pending) { - r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY; + r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) goto busy; if (r) { @@ -8345,35 +8396,35 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit enter_smm(vcpu); can_inject = false; } else - kvm_x86_ops.enable_smi_window(vcpu); + static_call(kvm_x86_enable_smi_window)(vcpu); } if (vcpu->arch.nmi_pending) { - r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY; + r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; if (r < 0) goto busy; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - kvm_x86_ops.set_nmi(vcpu); + static_call(kvm_x86_set_nmi)(vcpu); can_inject = false; - WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0); + WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } if (vcpu->arch.nmi_pending) - kvm_x86_ops.enable_nmi_window(vcpu); + static_call(kvm_x86_enable_nmi_window)(vcpu); } if (kvm_cpu_has_injectable_intr(vcpu)) { - r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY; + r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; if (r < 0) goto busy; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - kvm_x86_ops.set_irq(vcpu); - WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0); + static_call(kvm_x86_set_irq)(vcpu); + WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) - kvm_x86_ops.enable_irq_window(vcpu); + static_call(kvm_x86_enable_irq_window)(vcpu); } if (is_guest_mode(vcpu) && @@ -8398,7 +8449,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * If an NMI is already in progress, limit further NMIs to just one. * Otherwise, allow two (and we'll inject the first one immediately). */ - if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); @@ -8488,11 +8539,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f7c, seg.limit); put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - kvm_x86_ops.get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); put_smstate(u32, buf, 0x7f70, dt.size); - kvm_x86_ops.get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); put_smstate(u32, buf, 0x7f58, dt.address); put_smstate(u32, buf, 0x7f54, dt.size); @@ -8542,7 +8593,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); - kvm_x86_ops.get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); put_smstate(u32, buf, 0x7e84, dt.size); put_smstate(u64, buf, 0x7e88, dt.address); @@ -8552,7 +8603,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); - kvm_x86_ops.get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); put_smstate(u32, buf, 0x7e64, dt.size); put_smstate(u64, buf, 0x7e68, dt.address); @@ -8582,30 +8633,30 @@ static void enter_smm(struct kvm_vcpu *vcpu) * vCPU state (e.g. leave guest mode) after we've saved the state into * the SMM state-save area. */ - kvm_x86_ops.pre_enter_smm(vcpu, buf); + static_call(kvm_x86_pre_enter_smm)(vcpu, buf); vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - if (kvm_x86_ops.get_nmi_mask(vcpu)) + if (static_call(kvm_x86_get_nmi_mask)(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - kvm_x86_ops.set_nmi_mask(vcpu, true); + static_call(kvm_x86_set_nmi_mask)(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - kvm_x86_ops.set_cr0(vcpu, cr0); + static_call(kvm_x86_set_cr0)(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_x86_ops.set_cr4(vcpu, 0); + static_call(kvm_x86_set_cr4)(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. */ dt.address = dt.size = 0; - kvm_x86_ops.set_idt(vcpu, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); - __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + kvm_set_dr(vcpu, 7, DR7_FIXED_1); cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; cs.base = vcpu->arch.smbase; @@ -8634,7 +8685,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - kvm_x86_ops.set_efer(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); #endif kvm_update_cpuid_runtime(vcpu); @@ -8672,7 +8723,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); kvm_apic_update_apicv(vcpu); - kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu); + static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -8689,7 +8740,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) unsigned long old, new, expected; if (!kvm_x86_ops.check_apicv_inhibit_reasons || - !kvm_x86_ops.check_apicv_inhibit_reasons(bit)) + !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); @@ -8709,7 +8760,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) trace_kvm_apicv_update_request(activate, bit); if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate); + static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); /* * Sending request to update APICV for all other vcpus, @@ -8735,7 +8786,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { if (vcpu->arch.apicv_active) - kvm_x86_ops.sync_pir_to_irr(vcpu); + static_call(kvm_x86_sync_pir_to_irr)(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -8753,9 +8804,12 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, - vcpu_to_synic(vcpu)->vec_bitmap, 256); - kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); + if (to_hv_vcpu(vcpu)) + bitmap_or((ulong *)eoi_exit_bitmap, + vcpu->arch.ioapic_handled_vectors, + to_hv_synic(vcpu)->vec_bitmap, 256); + + static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -8780,7 +8834,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!kvm_x86_ops.set_apic_access_page_addr) return; - kvm_x86_ops.set_apic_access_page_addr(vcpu); + static_call(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -8905,8 +8959,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv = vcpu->arch.hyperv.exit; + vcpu->run->hyperv = hv_vcpu->exit; r = 0; goto out; } @@ -8923,10 +8979,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_ops.msr_filter_changed(vcpu); + static_call(kvm_x86_msr_filter_changed)(vcpu); } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || + kvm_xen_has_interrupt(vcpu)) { ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -8936,7 +8993,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) inject_pending_event(vcpu, &req_immediate_exit); if (req_int_win) - kvm_x86_ops.enable_irq_window(vcpu); + static_call(kvm_x86_enable_irq_window)(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -8951,7 +9008,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - kvm_x86_ops.prepare_guest_switch(vcpu); + static_call(kvm_x86_prepare_guest_switch)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -8982,7 +9039,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * notified with kvm_vcpu_kick. */ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - kvm_x86_ops.sync_pir_to_irr(vcpu); + static_call(kvm_x86_sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -8996,7 +9053,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops.request_immediate_exit(vcpu); + static_call(kvm_x86_request_immediate_exit)(vcpu); } fpregs_assert_state_consistent(); @@ -9013,7 +9070,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - exit_fastpath = kvm_x86_ops.run(vcpu); + for (;;) { + exit_fastpath = static_call(kvm_x86_run)(vcpu); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { + exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; + break; + } + + if (vcpu->arch.apicv_active) + static_call(kvm_x86_sync_pir_to_irr)(vcpu); + } /* * Do this here before restoring debug registers on the host. And @@ -9023,7 +9092,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_ops.sync_dirty_debug_regs(vcpu); + static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; @@ -9045,7 +9114,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops.handle_exit_irqoff(vcpu); + static_call(kvm_x86_handle_exit_irqoff)(vcpu); /* * Consume any pending interrupts, including the possible source of @@ -9087,13 +9156,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath); + r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); return r; cancel_injection: if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops.cancel_injection(vcpu); + static_call(kvm_x86_cancel_injection)(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: @@ -9103,13 +9172,13 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) { + (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (kvm_x86_ops.post_block) - kvm_x86_ops.post_block(vcpu); + static_call(kvm_x86_post_block)(vcpu); if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; @@ -9330,6 +9399,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu_load(vcpu); kvm_sigset_activate(vcpu); + kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { @@ -9504,10 +9574,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_x86_ops.get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - kvm_x86_ops.get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; @@ -9625,7 +9695,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return false; - if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits) + if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) return false; } else { /* @@ -9660,10 +9730,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - kvm_x86_ops.set_idt(vcpu, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - kvm_x86_ops.set_gdt(vcpu, &dt); + static_call(kvm_x86_set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; @@ -9673,14 +9743,14 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops.set_efer(vcpu, sregs->efer); + static_call(kvm_x86_set_efer)(vcpu, sregs->efer); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops.set_cr0(vcpu, sregs->cr0); + static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - kvm_x86_ops.set_cr4(vcpu, sregs->cr4); + static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { @@ -9788,7 +9858,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops.update_exception_bitmap(vcpu); + static_call(kvm_x86_update_exception_bitmap)(vcpu); r = 0; @@ -9966,7 +10036,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) vcpu->arch.apicv_active = true; } else - static_key_slow_inc(&kvm_no_apic_vcpu); + static_branch_inc(&kvm_has_noapic_vcpu); r = -ENOMEM; @@ -10004,7 +10074,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -10014,9 +10084,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pending_external_vector = -1; vcpu->arch.preempted_in_kernel = false; - kvm_hv_vcpu_init(vcpu); - - r = kvm_x86_ops.vcpu_create(vcpu); + r = static_call(kvm_x86_vcpu_create)(vcpu); if (r) goto free_guest_fpu; @@ -10052,8 +10120,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - kvm_hv_vcpu_postcreate(vcpu); - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); @@ -10079,7 +10145,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); - kvm_x86_ops.vcpu_free(vcpu); + static_call(kvm_x86_vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); @@ -10096,7 +10162,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); + static_branch_dec(&kvm_has_noapic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -10115,7 +10181,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); - vcpu->arch.dr6 = DR6_INIT; + vcpu->arch.dr6 = DR6_ACTIVE_LOW; vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -10168,7 +10234,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.ia32_xss = 0; - kvm_x86_ops.vcpu_reset(vcpu, init_event); + static_call(kvm_x86_vcpu_reset)(vcpu, init_event); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -10194,7 +10260,7 @@ int kvm_arch_hardware_enable(void) bool stable, backwards_tsc = false; kvm_user_return_msr_cpu_online(); - ret = kvm_x86_ops.hardware_enable(); + ret = static_call(kvm_x86_hardware_enable)(); if (ret != 0) return ret; @@ -10276,7 +10342,7 @@ int kvm_arch_hardware_enable(void) void kvm_arch_hardware_disable(void) { - kvm_x86_ops.hardware_disable(); + static_call(kvm_x86_hardware_disable)(); drop_user_return_notifiers(); } @@ -10295,6 +10361,7 @@ int kvm_arch_hardware_setup(void *opaque) return r; memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + kvm_ops_static_call_update(); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -10323,7 +10390,7 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { - kvm_x86_ops.hardware_unsetup(); + static_call(kvm_x86_hardware_unsetup)(); } int kvm_arch_check_processor_compat(void *opaque) @@ -10351,8 +10418,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -struct static_key kvm_no_apic_vcpu __read_mostly; -EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); +__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { @@ -10363,12 +10430,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) pmu->need_cleanup = true; kvm_make_request(KVM_REQ_PMU, vcpu); } - kvm_x86_ops.sched_in(vcpu, cpu); + static_call(kvm_x86_sched_in)(vcpu, cpu); } void kvm_arch_free_vm(struct kvm *kvm) { - kfree(kvm->arch.hyperv.hv_pa_pg); + kfree(to_kvm_hv(kvm)->hv_pa_pg); vfree(kvm); } @@ -10407,7 +10474,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - return kvm_x86_ops.vm_init(kvm); + return static_call(kvm_x86_vm_init)(kvm); } int kvm_arch_post_init_vm(struct kvm *kvm) @@ -10552,8 +10619,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - if (kvm_x86_ops.vm_destroy) - kvm_x86_ops.vm_destroy(kvm); + static_call_cond(kvm_x86_vm_destroy)(kvm); for (i = 0; i < kvm->arch.msr_filter.count; i++) kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); @@ -10563,6 +10629,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); + kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); } @@ -10744,7 +10811,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { if (kvm_x86_ops.slot_enable_log_dirty) { - kvm_x86_ops.slot_enable_log_dirty(kvm, new); + static_call(kvm_x86_slot_enable_log_dirty)(kvm, new); } else { int level = kvm_dirty_log_manual_protect_and_init_set(kvm) ? @@ -10761,8 +10828,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, new, level); } } else { - if (kvm_x86_ops.slot_disable_log_dirty) - kvm_x86_ops.slot_disable_log_dirty(kvm, new); + static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new); } } @@ -10801,7 +10867,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && kvm_x86_ops.guest_apic_has_interrupt && - kvm_x86_ops.guest_apic_has_interrupt(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -10820,12 +10886,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops.nmi_allowed(vcpu, false))) + static_call(kvm_x86_nmi_allowed)(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && - kvm_x86_ops.smi_allowed(vcpu, false))) + static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -10859,7 +10925,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; - if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu)) + if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; @@ -10877,7 +10943,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.interrupt_allowed(vcpu, false); + return static_call(kvm_x86_interrupt_allowed)(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -10903,7 +10969,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = kvm_x86_ops.get_rflags(vcpu); + rflags = static_call(kvm_x86_get_rflags)(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -10915,7 +10981,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - kvm_x86_ops.set_rflags(vcpu, rflags); + static_call(kvm_x86_set_rflags)(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -11045,7 +11111,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) return false; if (!kvm_pv_async_pf_enabled(vcpu) || - (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0)) + (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0)) return false; return true; @@ -11190,7 +11256,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, + ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) @@ -11215,7 +11281,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -11226,7 +11292,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); } bool kvm_vector_hashing_enabled(void) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 0f727b50bd3d..39eb04887141 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -98,7 +98,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) if (!is_long_mode(vcpu)) return false; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } @@ -129,7 +129,7 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush_current(vcpu); + static_call(kvm_x86_tlb_flush_current)(vcpu); } static inline int is_pae(struct kvm_vcpu *vcpu) @@ -244,9 +244,10 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) { - return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); + return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu); } +void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -273,6 +274,8 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); +int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, + void *insn, int insn_len); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); @@ -296,6 +299,8 @@ extern int pi_inject_timer; extern struct static_key kvm_no_apic_vcpu; +extern bool report_ignored_msrs; + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, @@ -391,7 +396,6 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c new file mode 100644 index 000000000000..af8f6562fce4 --- /dev/null +++ b/arch/x86/kvm/xen.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. + * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * KVM Xen emulation + */ + +#include "x86.h" +#include "xen.h" +#include "hyperv.h" + +#include <linux/kvm_host.h> + +#include <trace/events/kvm.h> +#include <xen/interface/xen.h> + +#include "trace.h" + +DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); + +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +{ + gpa_t gpa = gfn_to_gpa(gfn); + int wc_ofs, sec_hi_ofs; + int ret; + int idx = srcu_read_lock(&kvm->srcu); + + ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, + gpa, PAGE_SIZE); + if (ret) + goto out; + + kvm->arch.xen.shinfo_set = true; + + /* Paranoia checks on the 32-bit struct layout */ + BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); + BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924); + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + /* 32-bit location by default */ + wc_ofs = offsetof(struct compat_shared_info, wc); + sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi); + +#ifdef CONFIG_X86_64 + /* Paranoia checks on the 64-bit struct layout */ + BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00); + BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c); + + if (kvm->arch.xen.long_mode) { + wc_ofs = offsetof(struct shared_info, wc); + sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi); + } +#endif + + kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs); + kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); + +out: + srcu_read_unlock(&kvm->srcu, idx); + return ret; +} + +int __kvm_xen_has_interrupt(struct kvm_vcpu *v) +{ + u8 rc = 0; + + /* + * If the global upcall vector (HVMIRQ_callback_vector) is set and + * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. + */ + struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; + struct kvm_memslots *slots = kvm_memslots(v->kvm); + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); + + /* No need for compat handling here */ + BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != + offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); + BUILD_BUG_ON(sizeof(rc) != + sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending)); + BUILD_BUG_ON(sizeof(rc) != + sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending)); + + /* + * For efficiency, this mirrors the checks for using the valid + * cache in kvm_read_guest_offset_cached(), but just uses + * __get_user() instead. And falls back to the slow path. + */ + if (likely(slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + /* Fast path */ + __get_user(rc, (u8 __user *)ghc->hva + offset); + } else { + /* Slow path */ + kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, + sizeof(rc)); + } + + return rc; +} + +int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + int r = -ENOENT; + + mutex_lock(&kvm->lock); + + switch (data->type) { + case KVM_XEN_ATTR_TYPE_LONG_MODE: + if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { + r = -EINVAL; + } else { + kvm->arch.xen.long_mode = !!data->u.long_mode; + r = 0; + } + break; + + case KVM_XEN_ATTR_TYPE_SHARED_INFO: + if (data->u.shared_info.gfn == GPA_INVALID) { + kvm->arch.xen.shinfo_set = false; + r = 0; + break; + } + r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + break; + + + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: + if (data->u.vector && data->u.vector < 0x10) + r = -EINVAL; + else { + kvm->arch.xen.upcall_vector = data->u.vector; + r = 0; + } + break; + + default: + break; + } + + mutex_unlock(&kvm->lock); + return r; +} + +int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + int r = -ENOENT; + + mutex_lock(&kvm->lock); + + switch (data->type) { + case KVM_XEN_ATTR_TYPE_LONG_MODE: + data->u.long_mode = kvm->arch.xen.long_mode; + r = 0; + break; + + case KVM_XEN_ATTR_TYPE_SHARED_INFO: + if (kvm->arch.xen.shinfo_set) + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); + else + data->u.shared_info.gfn = GPA_INVALID; + r = 0; + break; + + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: + data->u.vector = kvm->arch.xen.upcall_vector; + r = 0; + break; + + default: + break; + } + + mutex_unlock(&kvm->lock); + return r; +} + +int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) +{ + int idx, r = -ENOENT; + + mutex_lock(&vcpu->kvm->lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (data->type) { + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + /* No compat necessary here. */ + BUILD_BUG_ON(sizeof(struct vcpu_info) != + sizeof(struct compat_vcpu_info)); + + if (data->u.gpa == GPA_INVALID) { + vcpu->arch.xen.vcpu_info_set = false; + break; + } + + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, + sizeof(struct vcpu_info)); + if (!r) { + vcpu->arch.xen.vcpu_info_set = true; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: + if (data->u.gpa == GPA_INVALID) { + vcpu->arch.xen.vcpu_time_info_set = false; + break; + } + + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache, + data->u.gpa, + sizeof(struct pvclock_vcpu_time_info)); + if (!r) { + vcpu->arch.xen.vcpu_time_info_set = true; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + break; + + default: + break; + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + mutex_unlock(&vcpu->kvm->lock); + return r; +} + +int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) +{ + int r = -ENOENT; + + mutex_lock(&vcpu->kvm->lock); + + switch (data->type) { + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + if (vcpu->arch.xen.vcpu_info_set) + data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; + else + data->u.gpa = GPA_INVALID; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: + if (vcpu->arch.xen.vcpu_time_info_set) + data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; + else + data->u.gpa = GPA_INVALID; + r = 0; + break; + + default: + break; + } + + mutex_unlock(&vcpu->kvm->lock); + return r; +} + +int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u32 page_num = data & ~PAGE_MASK; + u64 page_addr = data & PAGE_MASK; + bool lm = is_long_mode(vcpu); + + /* Latch long_mode for shared_info pages etc. */ + vcpu->kvm->arch.xen.long_mode = lm; + + /* + * If Xen hypercall intercept is enabled, fill the hypercall + * page with VMCALL/VMMCALL instructions since that's what + * we catch. Else the VMM has provided the hypercall pages + * with instructions of its own choosing, so use those. + */ + if (kvm_xen_hypercall_enabled(kvm)) { + u8 instructions[32]; + int i; + + if (page_num) + return 1; + + /* mov imm32, %eax */ + instructions[0] = 0xb8; + + /* vmcall / vmmcall */ + kvm_x86_ops.patch_hypercall(vcpu, instructions + 5); + + /* ret */ + instructions[8] = 0xc3; + + /* int3 to pad */ + memset(instructions + 9, 0xcc, sizeof(instructions) - 9); + + for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) { + *(u32 *)&instructions[1] = i; + if (kvm_vcpu_write_guest(vcpu, + page_addr + (i * sizeof(instructions)), + instructions, sizeof(instructions))) + return 1; + } + } else { + /* + * Note, truncation is a non-issue as 'lm' is guaranteed to be + * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. + */ + hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 + : kvm->arch.xen_hvm_config.blob_addr_32; + u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 + : kvm->arch.xen_hvm_config.blob_size_32; + u8 *page; + + if (page_num >= blob_size) + return 1; + + blob_addr += page_num * PAGE_SIZE; + + page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE); + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; + } + } + return 0; +} + +int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) +{ + if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) + return -EINVAL; + + /* + * With hypercall interception the kernel generates its own + * hypercall page so it must not be provided. + */ + if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) && + (xhc->blob_addr_32 || xhc->blob_addr_64 || + xhc->blob_size_32 || xhc->blob_size_64)) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (xhc->msr && !kvm->arch.xen_hvm_config.msr) + static_branch_inc(&kvm_xen_enabled.key); + else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); + + memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); + + mutex_unlock(&kvm->lock); + return 0; +} + +void kvm_xen_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); +} + +static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) +{ + kvm_rax_write(vcpu, result); + return kvm_skip_emulated_instruction(vcpu); +} + +static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip))) + return 1; + + return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); +} + +int kvm_xen_hypercall(struct kvm_vcpu *vcpu) +{ + bool longmode; + u64 input, params[6]; + + input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); + + /* Hyper-V hypercalls get bit 31 set in EAX */ + if ((input & 0x80000000) && + kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + longmode = is_64_bit_mode(vcpu); + if (!longmode) { + params[0] = (u32)kvm_rbx_read(vcpu); + params[1] = (u32)kvm_rcx_read(vcpu); + params[2] = (u32)kvm_rdx_read(vcpu); + params[3] = (u32)kvm_rsi_read(vcpu); + params[4] = (u32)kvm_rdi_read(vcpu); + params[5] = (u32)kvm_rbp_read(vcpu); + } +#ifdef CONFIG_X86_64 + else { + params[0] = (u64)kvm_rdi_read(vcpu); + params[1] = (u64)kvm_rsi_read(vcpu); + params[2] = (u64)kvm_rdx_read(vcpu); + params[3] = (u64)kvm_r10_read(vcpu); + params[4] = (u64)kvm_r8_read(vcpu); + params[5] = (u64)kvm_r9_read(vcpu); + } +#endif + trace_kvm_xen_hypercall(input, params[0], params[1], params[2], + params[3], params[4], params[5]); + + vcpu->run->exit_reason = KVM_EXIT_XEN; + vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; + vcpu->run->xen.u.hcall.longmode = longmode; + vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu); + vcpu->run->xen.u.hcall.input = input; + vcpu->run->xen.u.hcall.params[0] = params[0]; + vcpu->run->xen.u.hcall.params[1] = params[1]; + vcpu->run->xen.u.hcall.params[2] = params[2]; + vcpu->run->xen.u.hcall.params[3] = params[3]; + vcpu->run->xen.u.hcall.params[4] = params[4]; + vcpu->run->xen.u.hcall.params[5] = params[5]; + vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = + kvm_xen_hypercall_complete_userspace; + + return 0; +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h new file mode 100644 index 000000000000..b66a921776f4 --- /dev/null +++ b/arch/x86/kvm/xen.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. + * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * KVM Xen emulation + */ + +#ifndef __ARCH_X86_KVM_XEN_H__ +#define __ARCH_X86_KVM_XEN_H__ + +#include <linux/jump_label_ratelimit.h> + +extern struct static_key_false_deferred kvm_xen_enabled; + +int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); +int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); +int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +int kvm_xen_hypercall(struct kvm_vcpu *vcpu); +int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); +int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_destroy_vm(struct kvm *kvm); + +static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + (kvm->arch.xen_hvm_config.flags & + KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); +} + +static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) +{ + if (static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector) + return __kvm_xen_has_interrupt(vcpu); + + return 0; +} + +/* 32-bit compatibility definitions, also used natively in 32-bit build */ +#include <asm/pvclock-abi.h> +#include <asm/xen/interface.h> + +struct compat_arch_vcpu_info { + unsigned int cr2; + unsigned int pad[5]; +}; + +struct compat_vcpu_info { + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + uint16_t pad; + uint32_t evtchn_pending_sel; + struct compat_arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; +}; /* 64 bytes (x86) */ + +struct compat_arch_shared_info { + unsigned int max_pfn; + unsigned int pfn_to_mfn_frame_list_list; + unsigned int nmi_reason; + unsigned int p2m_cr3; + unsigned int p2m_vaddr; + unsigned int p2m_generation; + uint32_t wc_sec_hi; +}; + +struct compat_shared_info { + struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS]; + uint32_t evtchn_pending[32]; + uint32_t evtchn_mask[32]; + struct pvclock_wall_clock wc; + struct compat_arch_shared_info arch; +}; + +#endif /* __ARCH_X86_KVM_XEN_H__ */ diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index 584b0de6f2ca..41c449ece2d8 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -12,8 +12,8 @@ #define _XTENSA_SPINLOCK_H #include <asm/barrier.h> -#include <asm/qrwlock.h> #include <asm/qspinlock.h> +#include <asm/qrwlock.h> #define smp_mb__after_spinlock() smp_mb() diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 476113e12489..cb9b4c4e371e 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -128,6 +128,7 @@ static int sev_cmd_buffer_len(int cmd) case SEV_CMD_LAUNCH_UPDATE_SECRET: return sizeof(struct sev_data_launch_secret); case SEV_CMD_DOWNLOAD_FIRMWARE: return sizeof(struct sev_data_download_firmware); case SEV_CMD_GET_ID: return sizeof(struct sev_data_get_id); + case SEV_CMD_ATTESTATION_REPORT: return sizeof(struct sev_data_attestation_report); default: return 0; } diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 60f1a386dd06..b4348256ae95 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1703,7 +1703,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn) return -EINVAL; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (kvmgt_gfn_is_write_protected(info, gfn)) goto out; @@ -1712,7 +1712,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn) kvmgt_protect_table_add(info, gfn); out: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); return 0; } @@ -1737,7 +1737,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn) return -EINVAL; } - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); if (!kvmgt_gfn_is_write_protected(info, gfn)) goto out; @@ -1746,7 +1746,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn) kvmgt_protect_table_del(info, gfn); out: - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); return 0; } @@ -1772,7 +1772,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvmgt_guest_info *info = container_of(node, struct kvmgt_guest_info, track_node); - spin_lock(&kvm->mmu_lock); + write_lock(&kvm->mmu_lock); for (i = 0; i < slot->npages; i++) { gfn = slot->base_gfn + i; if (kvmgt_gfn_is_write_protected(info, gfn)) { @@ -1781,7 +1781,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, kvmgt_protect_table_del(info, gfn); } } - spin_unlock(&kvm->mmu_lock); + write_unlock(&kvm->mmu_lock); } static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm) @@ -810,11 +810,12 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); /* - * Note because we provide range to follow_pte it will call + * follow_invalidate_pte() will use the range to call * mmu_notifier_invalidate_range_start() on our behalf before * taking any lock. */ - if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl)) + if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, + &pmdp, &ptl)) continue; /* diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 365345f9a9e3..07a36a874dca 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -33,7 +33,7 @@ */ .macro ___EXPORT_SYMBOL name,val,sec -#ifdef CONFIG_MODULES +#if defined(CONFIG_MODULES) && !defined(__DISABLE_EXPORTS) .section ___ksymtab\sec+\name,"a" .balign KSYM_ALIGN __ksymtab_\name: diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 84ce841ce735..7ae0ece07b4e 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -15,6 +15,8 @@ #include <asm-generic/qrwlock_types.h> +/* Must be included from asm/spinlock.h after defining arch_spin_is_locked. */ + /* * Writer states & reader shift and bias. */ @@ -116,15 +118,26 @@ static inline void queued_write_unlock(struct qrwlock *lock) smp_store_release(&lock->wlocked, 0); } +/** + * queued_rwlock_is_contended - check if the lock is contended + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock contended, 0 otherwise + */ +static inline int queued_rwlock_is_contended(struct qrwlock *lock) +{ + return arch_spin_is_locked(&lock->wait_lock); +} + /* * Remapping rwlock architecture specific functions to the corresponding * queue rwlock functions. */ -#define arch_read_lock(l) queued_read_lock(l) -#define arch_write_lock(l) queued_write_lock(l) -#define arch_read_trylock(l) queued_read_trylock(l) -#define arch_write_trylock(l) queued_write_trylock(l) -#define arch_read_unlock(l) queued_read_unlock(l) -#define arch_write_unlock(l) queued_write_unlock(l) +#define arch_read_lock(l) queued_read_lock(l) +#define arch_write_lock(l) queued_write_lock(l) +#define arch_read_trylock(l) queued_read_trylock(l) +#define arch_write_trylock(l) queued_write_trylock(l) +#define arch_read_unlock(l) queued_read_unlock(l) +#define arch_write_unlock(l) queued_write_unlock(l) +#define arch_rwlock_is_contended(l) queued_rwlock_is_contended(l) #endif /* __ASM_GENERIC_QRWLOCK_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f3b1013fb22c..e126ebda36d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -425,9 +425,8 @@ struct kvm_irq_routing_table { #define KVM_PRIVATE_MEM_SLOTS 0 #endif -#ifndef KVM_MEM_SLOTS_NUM -#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#endif +#define KVM_MEM_SLOTS_NUM SHRT_MAX +#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS) #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) @@ -451,7 +450,12 @@ struct kvm_memslots { }; struct kvm { +#ifdef KVM_HAVE_MMU_RWLOCK + rwlock_t mmu_lock; +#else spinlock_t mmu_lock; +#endif /* KVM_HAVE_MMU_RWLOCK */ + struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; diff --git a/include/linux/mm.h b/include/linux/mm.h index ab959488bc0f..1696ee6ab22d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1663,9 +1663,11 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, + pmd_t **pmdpp, spinlock_t **ptlp); int follow_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, - spinlock_t **ptlp); + pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 49d155cd2dfe..b801ead1e2bb 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -66,6 +66,7 @@ enum sev_cmd { SEV_CMD_LAUNCH_MEASURE = 0x033, SEV_CMD_LAUNCH_UPDATE_SECRET = 0x034, SEV_CMD_LAUNCH_FINISH = 0x035, + SEV_CMD_ATTESTATION_REPORT = 0x036, /* Guest migration commands (outgoing) */ SEV_CMD_SEND_START = 0x040, @@ -483,6 +484,22 @@ struct sev_data_dbg { u32 len; /* In */ } __packed; +/** + * struct sev_data_attestation_report - SEV_ATTESTATION_REPORT command parameters + * + * @handle: handle of the VM + * @mnonce: a random nonce that will be included in the report. + * @address: physical address where the report will be copied. + * @len: length of the physical buffer. + */ +struct sev_data_attestation_report { + u32 handle; /* In */ + u32 reserved; + u64 address; /* In */ + u8 mnonce[16]; /* In */ + u32 len; /* In/Out */ +} __packed; + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 3dcd617e65ae..7ce9a51ae5c0 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -128,4 +128,11 @@ do { \ 1 : ({ local_irq_restore(flags); 0; }); \ }) +#ifdef arch_rwlock_is_contended +#define rwlock_is_contended(lock) \ + arch_rwlock_is_contended(&(lock)->raw_lock) +#else +#define rwlock_is_contended(lock) ((void)(lock), 0) +#endif /* arch_rwlock_is_contended */ + #endif /* __LINUX_RWLOCK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d568288abf9..26f499810dfa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1904,12 +1904,24 @@ static inline int _cond_resched(void) { return 0; } }) extern int __cond_resched_lock(spinlock_t *lock); +extern int __cond_resched_rwlock_read(rwlock_t *lock); +extern int __cond_resched_rwlock_write(rwlock_t *lock); #define cond_resched_lock(lock) ({ \ ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) +#define cond_resched_rwlock_read(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_read(lock); \ +}) + +#define cond_resched_rwlock_write(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_write(lock); \ +}) + static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) @@ -1933,6 +1945,23 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Check if a rwlock is contended. + * Returns non-zero if there is another task waiting on the rwlock. + * Returns zero if the lock is not contended or the system / underlying + * rwlock implementation does not support contention detection. + * Technically does not depend on CONFIG_PREEMPTION, but a general need + * for low latency. + */ +static inline int rwlock_needbreak(rwlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return rwlock_is_contended(lock); +#else + return 0; +#endif +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 374c67875cdb..8b281f722e5b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -216,6 +216,20 @@ struct kvm_hyperv_exit { } u; }; +struct kvm_xen_exit { +#define KVM_EXIT_XEN_HCALL 1 + __u32 type; + union { + struct { + __u32 longmode; + __u32 cpl; + __u64 input; + __u64 result; + __u64 params[6]; + } hcall; + } u; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -252,6 +266,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_X86_WRMSR 30 #define KVM_EXIT_DIRTY_RING_FULL 31 #define KVM_EXIT_AP_RESET_HOLD 32 +#define KVM_EXIT_X86_BUS_LOCK 33 +#define KVM_EXIT_XEN 34 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -428,6 +444,8 @@ struct kvm_run { __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ } msr; + /* KVM_EXIT_XEN */ + struct kvm_xen_exit xen; /* Fix the size of the union. */ char padding[256]; }; @@ -1058,6 +1076,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 #define KVM_CAP_DIRTY_LOG_RING 192 +#define KVM_CAP_X86_BUS_LOCK_EXIT 193 +#define KVM_CAP_PPC_DAWR1 194 #ifdef KVM_CAP_IRQ_ROUTING @@ -1131,6 +1151,10 @@ struct kvm_x86_mce { #endif #ifdef KVM_CAP_XEN_HVM +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) + struct kvm_xen_hvm_config { __u32 flags; __u32 msr; @@ -1565,6 +1589,45 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_DIRTY_LOG_RING */ #define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7) +/* Per-VM Xen attributes */ +#define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) +#define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + struct { + __u64 gfn; + } shared_info; + __u64 pad[8]; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 + +/* Per-vCPU Xen attributes */ +#define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) +#define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; + __u64 pad[8]; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1593,6 +1656,8 @@ enum sev_cmd_id { KVM_SEV_DBG_ENCRYPT, /* Guest certificates commands */ KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, KVM_SEV_NR_MAX, }; @@ -1645,6 +1710,12 @@ struct kvm_sev_dbg { __u32 len; }; +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) @@ -1766,4 +1837,7 @@ struct kvm_dirty_gfn { __u64 offset; }; +#define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) +#define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) + #endif /* __LINUX_KVM_H */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 8bfb242f433e..5ee37a296481 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -598,7 +598,9 @@ struct shared_info { * their gettimeofday() syscall on this wallclock-base value. */ struct pvclock_wall_clock wc; - +#ifndef CONFIG_X86_32 + uint32_t wc_sec_hi; +#endif struct arch_shared_info arch; }; diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fe9ca92faa2a..4786dd271b45 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,7 +12,6 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> -#include <asm/qrwlock.h> /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f5ffc878411..ca2bb629595f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7028,6 +7028,46 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); +int __cond_resched_rwlock_read(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_read(lock); + + if (rwlock_needbreak(lock) || resched) { + read_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + read_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_read); + +int __cond_resched_rwlock_write(rwlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held_write(lock); + + if (rwlock_needbreak(lock) || resched) { + write_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + write_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_rwlock_write); + /** * yield - yield the current processor to other threads. * diff --git a/mm/memory.c b/mm/memory.c index b32f32bf584d..c32318dc11d4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4642,9 +4642,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int follow_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, - spinlock_t **ptlp) +int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, + pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4710,6 +4710,34 @@ out: } /** + * follow_pte - look up PTE at a user virtual address + * @mm: the mm_struct of the target address space + * @address: user virtual address + * @ptepp: location to store found PTE + * @ptlp: location to store the lock for the PTE + * + * On a successful return, the pointer to the PTE is stored in @ptepp; + * the corresponding lock is taken and its location is stored in @ptlp. + * The contents of the PTE are only stable until @ptlp is released; + * any further use, if any, must be protected against invalidation + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + * should be taken for read. + * + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + * it is not a good general-purpose API. + * + * Return: zero on success, -ve otherwise. + */ +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp); +} +EXPORT_SYMBOL_GPL(follow_pte); + +/** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping * @address: user virtual address @@ -4717,6 +4745,9 @@ out: * * Only IO mappings and raw PFN mappings are allowed. * + * This function does not allow the caller to read the permissions + * of the PTE. Do not use it. + * * Return: zero and the pfn at @pfn on success, -ve otherwise. */ int follow_pfn(struct vm_area_struct *vma, unsigned long address, @@ -4729,7 +4760,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; - ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); @@ -4750,7 +4781,7 @@ int follow_phys(struct vm_area_struct *vma, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl)) + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; pte = *ptep; diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index c3af3f324c5a..9f18fa090f1f 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -644,6 +644,8 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_MMCR3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1) #define KVM_REG_PPC_SIER2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2) #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) +#define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) +#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 374c67875cdb..abb89bbe5635 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 #define KVM_CAP_DIRTY_LOG_RING 192 +#define KVM_CAP_PPC_DAWR1 194 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index ce8f4ad39684..3a84394829ea 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -7,6 +7,7 @@ /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test +/x86_64/get_cpuid_test /x86_64/kvm_pv_test /x86_64/hyperv_cpuid /x86_64/mmio_warning_test @@ -24,10 +25,15 @@ /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test +/x86_64/xapic_ipi_test +/x86_64/xen_shinfo_test +/x86_64/xen_vmcall_test /x86_64/xss_msr_test +/x86_64/vmx_pmu_msrs_test /demand_paging_test /dirty_log_test /dirty_log_perf_test /kvm_create_max_vcpus +/memslot_modification_stress_test /set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index fe41c6a0fa67..8c8eda429576 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -40,6 +40,7 @@ LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_ha TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test +TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test @@ -56,13 +57,18 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test +TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index cdad1eca72f7..5f7a229c3af1 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -64,7 +64,7 @@ static void *vcpu_worker(void *data) exit_reason_str(run->exit_reason)); } - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, ts_diff.tv_sec, ts_diff.tv_nsec); @@ -95,7 +95,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) return r; } - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, timespec_to_ns(ts_diff)); @@ -190,7 +190,7 @@ static void *uffd_handler_thread_fn(void *arg) pages++; } - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", pages, ts_diff.tv_sec, ts_diff.tv_nsec, pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); @@ -250,6 +250,7 @@ static int setup_demand_paging(struct kvm_vm *vm, struct test_params { bool use_uffd; useconds_t uffd_delay; + bool partition_vcpu_memory_access; }; static void run_test(enum vm_guest_mode mode, void *arg) @@ -265,7 +266,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; int r; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size); + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + VM_MEM_SRC_ANONYMOUS); perf_test_args.wr_fract = 1; @@ -277,7 +279,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, + p->partition_vcpu_memory_access); if (p->use_uffd) { uffd_handler_threads = @@ -293,10 +296,19 @@ static void run_test(enum vm_guest_mode mode, void *arg) for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vm_paddr_t vcpu_gpa; void *vcpu_hva; + uint64_t vcpu_mem_size; - vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size); + + if (p->partition_vcpu_memory_access) { + vcpu_gpa = guest_test_phys_mem + + (vcpu_id * guest_percpu_mem_size); + vcpu_mem_size = guest_percpu_mem_size; + } else { + vcpu_gpa = guest_test_phys_mem; + vcpu_mem_size = guest_percpu_mem_size * nr_vcpus; + } PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size); + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size); /* Cache the HVA pointer of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); @@ -313,7 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) &uffd_handler_threads[vcpu_id], pipefds[vcpu_id * 2], p->uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, guest_percpu_mem_size); + vcpu_hva, vcpu_mem_size); if (r < 0) exit(-r); } @@ -339,7 +351,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); } - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); pr_info("All vCPU threads joined\n"); @@ -376,7 +388,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" - " [-b memory] [-v vcpus]\n", name); + " [-b memory] [-v vcpus] [-o]\n", name); guest_modes_help(); printf(" -u: use User Fault FD to handle vCPU page\n" " faults.\n"); @@ -387,6 +399,8 @@ static void help(char *name) " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); puts(""); exit(0); } @@ -394,12 +408,14 @@ static void help(char *name) int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); - struct test_params p = {}; + struct test_params p = { + .partition_vcpu_memory_access = true, + }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) { + while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -419,6 +435,9 @@ int main(int argc, char *argv[]) TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; + case 'o': + p.partition_vcpu_memory_access = false; + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 2283a0ec74a9..04a2641261be 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -28,8 +28,8 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; /* Host variables */ static u64 dirty_log_manual_caps; static bool host_quit; -static uint64_t iteration; -static uint64_t vcpu_last_completed_iteration[KVM_MAX_VCPUS]; +static int iteration; +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; static void *vcpu_worker(void *data) { @@ -48,11 +48,11 @@ static void *vcpu_worker(void *data) run = vcpu_state(vm, vcpu_id); while (!READ_ONCE(host_quit)) { - uint64_t current_iteration = READ_ONCE(iteration); + int current_iteration = READ_ONCE(iteration); clock_gettime(CLOCK_MONOTONIC, &start); ret = _vcpu_run(vm, vcpu_id); - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, @@ -61,17 +61,17 @@ static void *vcpu_worker(void *data) pr_debug("Got sync event from vCPU %d\n", vcpu_id); vcpu_last_completed_iteration[vcpu_id] = current_iteration; - pr_debug("vCPU %d updated last completed iteration to %lu\n", + pr_debug("vCPU %d updated last completed iteration to %d\n", vcpu_id, vcpu_last_completed_iteration[vcpu_id]); if (current_iteration) { pages_count += vcpu_args->pages; total = timespec_add(total, ts_diff); - pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n", + pr_debug("vCPU %d iteration %d dirty memory time: %ld.%.9lds\n", vcpu_id, current_iteration, ts_diff.tv_sec, ts_diff.tv_nsec); } else { - pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n", + pr_debug("vCPU %d iteration %d populate memory time: %ld.%.9lds\n", vcpu_id, current_iteration, ts_diff.tv_sec, ts_diff.tv_nsec); } @@ -81,7 +81,7 @@ static void *vcpu_worker(void *data) } avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]); - pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id], total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); @@ -92,6 +92,8 @@ struct test_params { unsigned long iterations; uint64_t phys_offset; int wr_fract; + bool partition_vcpu_memory_access; + enum vm_mem_backing_src_type backing_src; }; static void run_test(enum vm_guest_mode mode, void *arg) @@ -111,7 +113,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_enable_cap cap = {}; struct timespec clear_dirty_log_total = (struct timespec){0}; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size); + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + p->backing_src); perf_test_args.wr_fract = p->wr_fract; @@ -129,7 +132,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, + p->partition_vcpu_memory_access); sync_global_to_guest(vm, perf_test_args); @@ -139,17 +143,21 @@ static void run_test(enum vm_guest_mode mode, void *arg) clock_gettime(CLOCK_MONOTONIC, &start); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vcpu_last_completed_iteration[vcpu_id] = -1; + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, &perf_test_args.vcpu_args[vcpu_id]); } - /* Allow the vCPU to populate memory */ - pr_debug("Starting iteration %lu - Populating\n", iteration); - while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) - pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n", - iteration); + /* Allow the vCPUs to populate memory */ + pr_debug("Starting iteration %d - Populating\n", iteration); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != + iteration) + ; + } - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); pr_info("Populate memory time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -157,7 +165,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) clock_gettime(CLOCK_MONOTONIC, &start); vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, KVM_MEM_LOG_DIRTY_PAGES); - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -169,25 +177,25 @@ static void run_test(enum vm_guest_mode mode, void *arg) clock_gettime(CLOCK_MONOTONIC, &start); iteration++; - pr_debug("Starting iteration %lu\n", iteration); + pr_debug("Starting iteration %d\n", iteration); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) - pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n", - vcpu_id, iteration); + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) + != iteration) + ; } - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff); - pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n", + pr_info("Iteration %d dirty memory time: %ld.%.9lds\n", iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap); - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); - pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n", + pr_info("Iteration %d get dirty log time: %ld.%.9lds\n", iteration, ts_diff.tv_sec, ts_diff.tv_nsec); if (dirty_log_manual_caps) { @@ -195,26 +203,26 @@ static void run_test(enum vm_guest_mode mode, void *arg) kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0, host_num_pages); - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); - pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n", + pr_info("Iteration %d clear dirty log time: %ld.%.9lds\n", iteration, ts_diff.tv_sec, ts_diff.tv_nsec); } } - /* Tell the vcpu thread to quit */ - host_quit = true; - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) - pthread_join(vcpu_threads[vcpu_id], NULL); - /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0); - ts_diff = timespec_diff_now(start); + ts_diff = timespec_elapsed(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); + /* Tell the vcpu thread to quit */ + host_quit = true; + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id], NULL); + avg = timespec_div(get_dirty_log_total, p->iterations); pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", p->iterations, get_dirty_log_total.tv_sec, @@ -236,7 +244,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] " - "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name); + "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -251,6 +259,11 @@ static void help(char *name) " 1/<fraction of pages to write>.\n" " (default: 1 i.e. all pages are written to.)\n"); printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + printf(" -s: specify the type of memory that should be used to\n" + " back the guest data region.\n\n"); + backing_src_help(); puts(""); exit(0); } @@ -261,6 +274,8 @@ int main(int argc, char *argv[]) struct test_params p = { .iterations = TEST_HOST_LOOP_N, .wr_fract = 1, + .partition_vcpu_memory_access = true, + .backing_src = VM_MEM_SRC_ANONYMOUS, }; int opt; @@ -271,10 +286,10 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) { + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) { switch (opt) { case 'i': - p.iterations = strtol(optarg, NULL, 10); + p.iterations = atoi(optarg); break; case 'p': p.phys_offset = strtoull(optarg, NULL, 0); @@ -295,6 +310,11 @@ int main(int argc, char *argv[]) TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; + case 'o': + p.partition_vcpu_memory_access = false; + case 's': + p.backing_src = parse_backing_src_type(optarg); + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 5cbb861525ed..2d7eb6989e83 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -79,12 +79,6 @@ struct vm_guest_mode_params { }; extern const struct vm_guest_mode_params vm_guest_mode_params[]; -enum vm_mem_backing_src_type { - VM_MEM_SRC_ANONYMOUS, - VM_MEM_SRC_ANONYMOUS_THP, - VM_MEM_SRC_ANONYMOUS_HUGETLB, -}; - int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h new file mode 100644 index 000000000000..b020547403fd --- /dev/null +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/numaif.h + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Header file that provides access to NUMA API functions not explicitly + * exported to user space. + */ + +#ifndef SELFTEST_KVM_NUMAIF_H +#define SELFTEST_KVM_NUMAIF_H + +#define __NR_get_mempolicy 239 +#define __NR_migrate_pages 256 + +/* System calls */ +long get_mempolicy(int *policy, const unsigned long *nmask, + unsigned long maxnode, void *addr, int flags) +{ + return syscall(__NR_get_mempolicy, policy, nmask, + maxnode, addr, flags); +} + +long migrate_pages(int pid, unsigned long maxnode, + const unsigned long *frommask, + const unsigned long *tomask) +{ + return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); +} + +/* Policies */ +#define MPOL_DEFAULT 0 +#define MPOL_PREFERRED 1 +#define MPOL_BIND 2 +#define MPOL_INTERLEAVE 3 + +#define MPOL_MAX MPOL_INTERLEAVE + +/* Flags for get_mem_policy */ +#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ + /* Warning: MPOL_F_NODE is unsupported and + * subject to change. Don't use. + */ +#define MPOL_F_ADDR (1<<1) /* look up vma using address */ +#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ + +/* Flags for mbind */ +#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ + +#endif /* SELFTEST_KVM_NUMAIF_H */ diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index b1188823c31b..005f2143adeb 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -44,8 +44,11 @@ extern struct perf_test_args perf_test_args; extern uint64_t guest_test_phys_mem; struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes); + uint64_t vcpu_memory_bytes, + enum vm_mem_backing_src_type backing_src); void perf_test_destroy_vm(struct kvm_vm *vm); -void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes); +void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, + uint64_t vcpu_memory_bytes, + bool partition_vcpu_memory_access); #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index ffffa560436b..b7f41399f22c 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -64,7 +64,21 @@ int64_t timespec_to_ns(struct timespec ts); struct timespec timespec_add_ns(struct timespec ts, int64_t ns); struct timespec timespec_add(struct timespec ts1, struct timespec ts2); struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); -struct timespec timespec_diff_now(struct timespec start); +struct timespec timespec_elapsed(struct timespec start); struct timespec timespec_div(struct timespec ts, int divisor); +enum vm_mem_backing_src_type { + VM_MEM_SRC_ANONYMOUS, + VM_MEM_SRC_ANONYMOUS_THP, + VM_MEM_SRC_ANONYMOUS_HUGETLB, +}; + +struct vm_mem_backing_src_alias { + const char *name; + enum vm_mem_backing_src_type type; +}; + +void backing_src_help(void); +enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 90cd5984751b..0b30b4e15c38 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -263,6 +263,19 @@ static inline void outl(uint16_t port, uint32_t value) __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); } +static inline void cpuid(uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) @@ -338,8 +351,10 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); struct kvm_msr_list *kvm_get_msr_index_list(void); - +uint64_t kvm_get_feature_msr(uint64_t msr_index); struct kvm_cpuid2 *kvm_get_supported_cpuid(void); + +struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid); @@ -391,6 +406,10 @@ bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); +struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); +void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); + /* * Basic CPU control in CR0 */ @@ -406,8 +425,27 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, #define X86_CR0_CD (1UL<<30) /* Cache Disable */ #define X86_CR0_PG (1UL<<31) /* Paging */ +#define APIC_DEFAULT_GPA 0xfee00000ULL + +/* APIC base address MSR and fields */ +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_EXTD (1<<10) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define GET_APIC_BASE(x) (((x) >> 12) << 12) + #define APIC_BASE_MSR 0x800 #define X2APIC_ENABLE (1UL << 10) +#define APIC_ID 0x20 +#define APIC_LVR 0x30 +#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) +#define APIC_TASKPRI 0x80 +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 @@ -432,6 +470,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 +#define SET_APIC_DEST_FIELD(x) ((x) << 24) /* VMX_EPT_VPID_CAP bits */ #define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index fa5a90e6c6f0..d787cb802b4a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1801,6 +1801,7 @@ static struct exit_reason { {KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"}, {KVM_EXIT_X86_RDMSR, "RDMSR"}, {KVM_EXIT_X86_WRMSR, "WRMSR"}, + {KVM_EXIT_XEN, "XEN"}, #ifdef KVM_EXIT_MEMORY_NOT_PRESENT {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"}, #endif diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 9be1944c2d1c..81490b9b4e32 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -49,7 +49,8 @@ static void guest_code(uint32_t vcpu_id) } struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes) + uint64_t vcpu_memory_bytes, + enum vm_mem_backing_src_type backing_src) { struct kvm_vm *vm; uint64_t guest_num_pages; @@ -93,8 +94,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); /* Add an extra memory slot for testing */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, + vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem, PERF_TEST_MEM_SLOT_INDEX, guest_num_pages, 0); @@ -112,7 +112,9 @@ void perf_test_destroy_vm(struct kvm_vm *vm) kvm_vm_free(vm); } -void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) +void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, + uint64_t vcpu_memory_bytes, + bool partition_vcpu_memory_access) { vm_paddr_t vcpu_gpa; struct perf_test_vcpu_args *vcpu_args; @@ -122,13 +124,22 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_by vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; vcpu_args->vcpu_id = vcpu_id; - vcpu_args->gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); - vcpu_args->pages = vcpu_memory_bytes / - perf_test_args.guest_page_size; + if (partition_vcpu_memory_access) { + vcpu_args->gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + perf_test_args.guest_page_size; + vcpu_gpa = guest_test_phys_mem + + (vcpu_id * vcpu_memory_bytes); + } else { + vcpu_args->gva = guest_test_virt_mem; + vcpu_args->pages = (vcpus * vcpu_memory_bytes) / + perf_test_args.guest_page_size; + vcpu_gpa = guest_test_phys_mem; + } - vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + vcpu_id, vcpu_gpa, vcpu_gpa + + (vcpu_args->pages * perf_test_args.guest_page_size)); } } diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 8e04c0b1608e..906c955384e2 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -10,6 +10,7 @@ #include <limits.h> #include <stdlib.h> #include <time.h> +#include "linux/kernel.h" #include "test_util.h" @@ -84,7 +85,7 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2) return timespec_add_ns((struct timespec){0}, ns1 - ns2); } -struct timespec timespec_diff_now(struct timespec start) +struct timespec timespec_elapsed(struct timespec start) { struct timespec end; @@ -109,3 +110,31 @@ void print_skip(const char *fmt, ...) va_end(ap); puts(", skipping test"); } + +const struct vm_mem_backing_src_alias backing_src_aliases[] = { + {"anonymous", VM_MEM_SRC_ANONYMOUS,}, + {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,}, + {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,}, +}; + +void backing_src_help(void) +{ + int i; + + printf("Available backing src types:\n"); + for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++) + printf("\t%s\n", backing_src_aliases[i].name); +} + +enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++) + if (!strcmp(type_name, backing_src_aliases[i].name)) + return backing_src_aliases[i].type; + + backing_src_help(); + TEST_FAIL("Unknown backing src type: %s", type_name); + return -1; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 95e1a757c629..de0c76177d02 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -670,6 +670,82 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) } /* + * KVM Get MSR + * + * Input Args: + * msr_index - Index of MSR + * + * Output Args: None + * + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. + * + * Get value of MSR for VCPU. + */ +uint64_t kvm_get_feature_msr(uint64_t msr_index) +{ + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r, kvm_fd; + + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); + + close(kvm_fd); + return buffer.entry.data; +} + +/* + * VM VCPU CPUID Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU id + * + * Output Args: None + * + * Return: KVM CPUID (KVM_GET_CPUID2) + * + * Set the VCPU's CPUID. + */ +struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_cpuid2 *cpuid; + int rc, max_ent; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + cpuid = allocate_kvm_cpuid2(); + max_ent = cpuid->nent; + + for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) { + rc = ioctl(vcpu->fd, KVM_GET_CPUID2, cpuid); + if (!rc) + break; + + TEST_ASSERT(rc == -1 && errno == E2BIG, + "KVM_GET_CPUID2 should either succeed or give E2BIG: %d %d", + rc, errno); + } + + TEST_ASSERT(rc == 0, "KVM_GET_CPUID2 failed, rc: %i errno: %i", + rc, errno); + + return cpuid; +} + + + +/* * Locate a cpuid entry. * * Input Args: @@ -1224,3 +1300,71 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, : "b"(a0), "c"(a1), "d"(a2), "S"(a3)); return r; } + +struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int ret; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(); + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n", + ret, errno); + + close(kvm_fd); + return cpuid; +} + +void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid) +{ + static struct kvm_cpuid2 *cpuid_full; + struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + int i, nent = 0; + + if (!cpuid_full) { + cpuid_sys = kvm_get_supported_cpuid(); + cpuid_hv = kvm_get_supported_hv_cpuid(); + + cpuid_full = malloc(sizeof(*cpuid_full) + + (cpuid_sys->nent + cpuid_hv->nent) * + sizeof(struct kvm_cpuid_entry2)); + if (!cpuid_full) { + perror("malloc"); + abort(); + } + + /* Need to skip KVM CPUID leaves 0x400000xx */ + for (i = 0; i < cpuid_sys->nent; i++) { + if (cpuid_sys->entries[i].function >= 0x40000000 && + cpuid_sys->entries[i].function < 0x40000100) + continue; + cpuid_full->entries[nent] = cpuid_sys->entries[i]; + nent++; + } + + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); + cpuid_full->nent = nent + cpuid_hv->nent; + } + + vcpu_set_cpuid(vm, vcpuid, cpuid_full); +} + +struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid) +{ + static struct kvm_cpuid2 *cpuid; + + cpuid = allocate_kvm_cpuid2(); + + vcpu_ioctl(vm, vcpuid, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + return cpuid; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 3a5c72ed2b79..827fe6028dd4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -74,7 +74,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); memset(vmcb, 0, sizeof(*vmcb)); - asm volatile ("vmsave\n\t" : : "a" (vmcb_gpa) : "memory"); + asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); @@ -131,19 +131,19 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) { asm volatile ( - "vmload\n\t" + "vmload %[vmcb_gpa]\n\t" "mov rflags, %%r15\n\t" // rflags "mov %%r15, 0x170(%[vmcb])\n\t" "mov guest_regs, %%r15\n\t" // rax "mov %%r15, 0x1f8(%[vmcb])\n\t" LOAD_GPR_C - "vmrun\n\t" + "vmrun %[vmcb_gpa]\n\t" SAVE_GPR_C "mov 0x170(%[vmcb]), %%r15\n\t" // rflags "mov %%r15, rflags\n\t" "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax "mov %%r15, guest_regs\n\t" - "vmsave\n\t" + "vmsave %[vmcb_gpa]\n\t" : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) : "r15", "memory"); } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c new file mode 100644 index 000000000000..6096bf0a5b34 --- /dev/null +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM memslot modification stress test + * Adapted from demand_paging_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2020, Google, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <asm/unistd.h> +#include <time.h> +#include <poll.h> +#include <pthread.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/userfaultfd.h> + +#include "perf_test_util.h" +#include "processor.h" +#include "test_util.h" +#include "guest_modes.h" + +#define DUMMY_MEMSLOT_INDEX 7 + +#define DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS 10 + + +static int nr_vcpus = 1; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +static bool run_vcpus = true; + +static void *vcpu_worker(void *data) +{ + int ret; + struct perf_test_vcpu_args *vcpu_args = + (struct perf_test_vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + struct kvm_vm *vm = perf_test_args.vm; + struct kvm_run *run; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + /* Let the guest access its memory until a stop signal is received */ + while (READ_ONCE(run_vcpus)) { + ret = _vcpu_run(vm, vcpu_id); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + + if (get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC) + continue; + + TEST_ASSERT(false, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + } + + return NULL; +} + +struct memslot_antagonist_args { + struct kvm_vm *vm; + useconds_t delay; + uint64_t nr_modifications; +}; + +static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, + uint64_t nr_modifications, uint64_t gpa) +{ + int i; + + for (i = 0; i < nr_modifications; i++) { + usleep(delay); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, + DUMMY_MEMSLOT_INDEX, 1, 0); + + vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX); + } +} + +struct test_params { + useconds_t memslot_modification_delay; + uint64_t nr_memslot_modifications; + bool partition_vcpu_memory_access; +}; + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *p = arg; + pthread_t *vcpu_threads; + struct kvm_vm *vm; + int vcpu_id; + + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + VM_MEM_SRC_ANONYMOUS); + + perf_test_args.wr_fract = 1; + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, + p->partition_vcpu_memory_access); + + /* Export the shared variables to the guest */ + sync_global_to_guest(vm, perf_test_args); + + pr_info("Finished creating vCPUs\n"); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &perf_test_args.vcpu_args[vcpu_id]); + + pr_info("Started all vCPUs\n"); + + add_remove_memslot(vm, p->memslot_modification_delay, + p->nr_memslot_modifications, + guest_test_phys_mem + + (guest_percpu_mem_size * nr_vcpus) + + perf_test_args.host_page_size + + perf_test_args.guest_page_size); + + run_vcpus = false; + + /* Wait for the vcpu threads to quit */ + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id], NULL); + + pr_info("All vCPU threads joined\n"); + + ucall_uninit(vm); + kvm_vm_free(vm); + + free(vcpu_threads); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-m mode] [-d delay_usec]\n" + " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); + guest_modes_help(); + printf(" -d: add a delay between each iteration of adding and\n" + " deleting a memslot in usec.\n"); + printf(" -b: specify the size of the memory region which should be\n" + " accessed by each vCPU. e.g. 10M or 3G.\n" + " Default: 1G\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + printf(" -i: specify the number of iterations of adding and removing\n" + " a memslot.\n" + " Default: %d\n", DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + int opt; + struct test_params p = { + .memslot_modification_delay = 0, + .nr_memslot_modifications = + DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS, + .partition_vcpu_memory_access = true + }; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 'd': + p.memslot_modification_delay = strtoul(optarg, NULL, 0); + TEST_ASSERT(p.memslot_modification_delay >= 0, + "A negative delay is not supported."); + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'v': + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", + max_vcpus); + break; + case 'o': + p.partition_vcpu_memory_access = false; + break; + case 'i': + p.nr_memslot_modifications = atoi(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + for_each_guest_mode(run_test, &p); + + return 0; +} diff --git a/tools/testing/selftests/kvm/settings b/tools/testing/selftests/kvm/settings new file mode 100644 index 000000000000..6091b45d226b --- /dev/null +++ b/tools/testing/selftests/kvm/settings @@ -0,0 +1 @@ +timeout=120 diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 37b8a78f6b74..ca22ee6d19cb 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -99,6 +99,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } + vcpu_set_hv_cpuid(vm, VCPU_ID); vcpu_enable_evmcs(vm, VCPU_ID); run = vcpu_state(vm, VCPU_ID); @@ -142,7 +143,7 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. */ kvm_vm_restart(vm, O_RDWR); vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_set_hv_cpuid(vm, VCPU_ID); vcpu_enable_evmcs(vm, VCPU_ID); vcpu_load_state(vm, VCPU_ID, state); run = vcpu_state(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c new file mode 100644 index 000000000000..9b78e8889638 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat Inc. + * + * Generic tests for KVM CPUID set/get ioctls + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +/* CPUIDs known to differ */ +struct { + u32 function; + u32 index; +} mangled_cpuids[] = { + {.function = 0xd, .index = 0}, +}; + +static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) +{ + int i; + u32 eax, ebx, ecx, edx; + + for (i = 0; i < guest_cpuid->nent; i++) { + eax = guest_cpuid->entries[i].function; + ecx = guest_cpuid->entries[i].index; + + cpuid(&eax, &ebx, &ecx, &edx); + + GUEST_ASSERT(eax == guest_cpuid->entries[i].eax && + ebx == guest_cpuid->entries[i].ebx && + ecx == guest_cpuid->entries[i].ecx && + edx == guest_cpuid->entries[i].edx); + } + +} + +static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid) +{ + u32 eax = 0x40000000, ebx, ecx = 0, edx; + + cpuid(&eax, &ebx, &ecx, &edx); + + GUEST_ASSERT(eax == 0x40000001); +} + +static void guest_main(struct kvm_cpuid2 *guest_cpuid) +{ + GUEST_SYNC(1); + + test_guest_cpuids(guest_cpuid); + + GUEST_SYNC(2); + + test_cpuid_40000000(guest_cpuid); + + GUEST_DONE(); +} + +static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie) +{ + int i; + + for (i = 0; i < sizeof(mangled_cpuids); i++) { + if (mangled_cpuids[i].function == entrie->function && + mangled_cpuids[i].index == entrie->index) + return true; + } + + return false; +} + +static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == entrie->function && + cpuid->entries[i].index == entrie->index) { + if (is_cpuid_mangled(entrie)) + return; + + TEST_ASSERT(cpuid->entries[i].eax == entrie->eax && + cpuid->entries[i].ebx == entrie->ebx && + cpuid->entries[i].ecx == entrie->ecx && + cpuid->entries[i].edx == entrie->edx, + "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", + entrie->function, entrie->index, + cpuid->entries[i].eax, cpuid->entries[i].ebx, + cpuid->entries[i].ecx, cpuid->entries[i].edx, + entrie->eax, entrie->ebx, entrie->ecx, entrie->edx); + return; + } + } + + TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index); +} + +static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) +{ + int i; + + for (i = 0; i < cpuid1->nent; i++) + check_cpuid(cpuid2, &cpuid1->entries[i]); + + for (i = 0; i < cpuid2->nent; i++) + check_cpuid(cpuid1, &cpuid2->entries[i]); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) +{ + struct ucall uc; + + _vcpu_run(vm, vcpuid); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + return; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } +} + +struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) +{ + int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); + vm_vaddr_t gva = vm_vaddr_alloc(vm, size, + getpagesize(), 0, 0); + struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); + + memcpy(guest_cpuids, cpuid, size); + + *p_gva = gva; + return guest_cpuids; +} + +int main(void) +{ + struct kvm_cpuid2 *supp_cpuid, *cpuid2; + vm_vaddr_t cpuid_gva; + struct kvm_vm *vm; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + + supp_cpuid = kvm_get_supported_cpuid(); + cpuid2 = vcpu_get_cpuid(vm, VCPU_ID); + + compare_cpuids(supp_cpuid, cpuid2); + + vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2); + + vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva); + + for (stage = 0; stage < 3; stage++) + run_vcpu(vm, VCPU_ID, stage); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 88a595b7fbdd..7e2d2d17d2ed 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -125,30 +125,6 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system) " it should have: %d %d", system ? "KVM" : "vCPU", ret, errno); } - -struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm, bool system) -{ - int nent = 20; /* should be enough */ - static struct kvm_cpuid2 *cpuid; - - cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2)); - - if (!cpuid) { - perror("malloc"); - abort(); - } - - cpuid->nent = nent; - - if (!system) - vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - else - kvm_ioctl(vm, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - return cpuid; -} - - int main(int argc, char *argv[]) { struct kvm_vm *vm; @@ -167,7 +143,7 @@ int main(int argc, char *argv[]) /* Test vCPU ioctl version */ test_hv_cpuid_e2big(vm, false); - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, false); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID); test_hv_cpuid(hv_cpuid_entries, false); free(hv_cpuid_entries); @@ -177,7 +153,7 @@ int main(int argc, char *argv[]) goto do_sys; } vcpu_enable_evmcs(vm, VCPU_ID); - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, false); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID); test_hv_cpuid(hv_cpuid_entries, true); free(hv_cpuid_entries); @@ -190,9 +166,8 @@ do_sys: test_hv_cpuid_e2big(vm, true); - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, true); + hv_cpuid_entries = kvm_get_supported_hv_cpuid(); test_hv_cpuid(hv_cpuid_entries, nested_vmx_supported()); - free(hv_cpuid_entries); out: kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c new file mode 100644 index 000000000000..23051d84b907 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * VMX-pmu related msrs test + * + * Copyright (C) 2021 Intel Corporation + * + * Test to check the effect of various CPUID settings + * on the MSR_IA32_PERF_CAPABILITIES MSR, and check that + * whatever we write with KVM_SET_MSR is _not_ modified + * in the guest and test it can be retrieved with KVM_GET_MSR. + * + * Test to check that invalid LBR formats are rejected. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <sys/ioctl.h> + +#include "kvm_util.h" +#include "vmx.h" + +#define VCPU_ID 0 + +#define X86_FEATURE_PDCM (1<<15) +#define PMU_CAP_FW_WRITES (1ULL << 13) +#define PMU_CAP_LBR_FMT 0x3f + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + u64 full_width_write:1; + u64 pebs_baseline:1; + u64 perf_metrics:1; + u64 pebs_output_pt_available:1; + u64 anythread_deprecated:1; + }; + u64 capabilities; +}; + +static void guest_code(void) +{ + wrmsr(MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT); +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid2 *cpuid; + struct kvm_cpuid_entry2 *entry_1_0; + struct kvm_cpuid_entry2 *entry_a_0; + bool pdcm_supported = false; + struct kvm_vm *vm; + int ret; + union cpuid10_eax eax; + union perf_capabilities host_cap; + + host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES); + host_cap.capabilities &= (PMU_CAP_FW_WRITES | PMU_CAP_LBR_FMT); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + cpuid = kvm_get_supported_cpuid(); + + if (kvm_get_cpuid_max_basic() >= 0xa) { + entry_1_0 = kvm_get_supported_cpuid_index(1, 0); + entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0); + pdcm_supported = entry_1_0 && !!(entry_1_0->ecx & X86_FEATURE_PDCM); + eax.full = entry_a_0->eax; + } + if (!pdcm_supported) { + print_skip("MSR_IA32_PERF_CAPABILITIES is not supported by the vCPU"); + exit(KSFT_SKIP); + } + if (!eax.split.version_id) { + print_skip("PMU is not supported by the vCPU"); + exit(KSFT_SKIP); + } + + /* testcase 1, set capabilities when we have PDCM bit */ + vcpu_set_cpuid(vm, VCPU_ID, cpuid); + vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); + + /* check capabilities can be retrieved with KVM_GET_MSR */ + ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); + + /* check whatever we write with KVM_SET_MSR is _not_ modified */ + vcpu_run(vm, VCPU_ID); + ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); + + /* testcase 2, check valid LBR formats are accepted */ + vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0); + ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0); + + vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format); + ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format); + + /* testcase 3, check invalid LBR format is rejected */ + ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT); + TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail."); + + /* testcase 4, set capabilities when we don't have PDCM bit */ + entry_1_0->ecx &= ~X86_FEATURE_PDCM; + vcpu_set_cpuid(vm, VCPU_ID, cpuid); + ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail."); + + /* testcase 5, set capabilities when we don't have PMU version bits */ + entry_1_0->ecx |= X86_FEATURE_PDCM; + eax.split.version_id = 0; + entry_1_0->ecx = eax.full; + vcpu_set_cpuid(vm, VCPU_ID, cpuid); + ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); + TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail."); + + vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0); + ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c new file mode 100644 index 000000000000..2f964cdc273c --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * xapic_ipi_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake + * another vCPU that is halted when KVM's backing page for the APIC access + * address has been moved by mm. + * + * The test starts two vCPUs: one that sends IPIs and one that continually + * executes HLT. The sender checks that the halter has woken from the HLT and + * has reentered HLT before sending the next IPI. While the vCPUs are running, + * the host continually calls migrate_pages to move all of the process' pages + * amongst the available numa nodes on the machine. + * + * Migration is a command line option. When used on non-numa machines will + * exit with error. Test is still usefull on non-numa for testing IPIs. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <getopt.h> +#include <pthread.h> +#include <inttypes.h> +#include <string.h> +#include <time.h> + +#include "kvm_util.h" +#include "numaif.h" +#include "processor.h" +#include "test_util.h" +#include "vmx.h" + +/* Default running time for the test */ +#define DEFAULT_RUN_SECS 3 + +/* Default delay between migrate_pages calls (microseconds) */ +#define DEFAULT_DELAY_USECS 500000 + +#define HALTER_VCPU_ID 0 +#define SENDER_VCPU_ID 1 + +volatile uint32_t *apic_base = (volatile uint32_t *)APIC_DEFAULT_GPA; + +/* + * Vector for IPI from sender vCPU to halting vCPU. + * Value is arbitrary and was chosen for the alternating bit pattern. Any + * value should work. + */ +#define IPI_VECTOR 0xa5 + +/* + * Incremented in the IPI handler. Provides evidence to the sender that the IPI + * arrived at the destination + */ +static volatile uint64_t ipis_rcvd; + +/* Data struct shared between host main thread and vCPUs */ +struct test_data_page { + uint32_t halter_apic_id; + volatile uint64_t hlt_count; + volatile uint64_t wake_count; + uint64_t ipis_sent; + uint64_t migrations_attempted; + uint64_t migrations_completed; + uint32_t icr; + uint32_t icr2; + uint32_t halter_tpr; + uint32_t halter_ppr; + + /* + * Record local version register as a cross-check that APIC access + * worked. Value should match what KVM reports (APIC_VERSION in + * arch/x86/kvm/lapic.c). If test is failing, check that values match + * to determine whether APIC access exits are working. + */ + uint32_t halter_lvr; +}; + +struct thread_params { + struct test_data_page *data; + struct kvm_vm *vm; + uint32_t vcpu_id; + uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ +}; + +uint32_t read_apic_reg(uint reg) +{ + return apic_base[reg >> 2]; +} + +void write_apic_reg(uint reg, uint32_t val) +{ + apic_base[reg >> 2] = val; +} + +void disable_apic(void) +{ + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) & + ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); +} + +void enable_xapic(void) +{ + uint64_t val = rdmsr(MSR_IA32_APICBASE); + + /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ + if (val & MSR_IA32_APICBASE_EXTD) { + disable_apic(); + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); + } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { + wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); + } + + /* + * Per SDM: reset value of spurious interrupt vector register has the + * APIC software enabled bit=0. It must be enabled in addition to the + * enable bit in the MSR. + */ + val = read_apic_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; + write_apic_reg(APIC_SPIV, val); +} + +void verify_apic_base_addr(void) +{ + uint64_t msr = rdmsr(MSR_IA32_APICBASE); + uint64_t base = GET_APIC_BASE(msr); + + GUEST_ASSERT(base == APIC_DEFAULT_GPA); +} + +static void halter_guest_code(struct test_data_page *data) +{ + verify_apic_base_addr(); + enable_xapic(); + + data->halter_apic_id = GET_APIC_ID_FIELD(read_apic_reg(APIC_ID)); + data->halter_lvr = read_apic_reg(APIC_LVR); + + /* + * Loop forever HLTing and recording halts & wakes. Disable interrupts + * each time around to minimize window between signaling the pending + * halt to the sender vCPU and executing the halt. No need to disable on + * first run as this vCPU executes first and the host waits for it to + * signal going into first halt before starting the sender vCPU. Record + * TPR and PPR for diagnostic purposes in case the test fails. + */ + for (;;) { + data->halter_tpr = read_apic_reg(APIC_TASKPRI); + data->halter_ppr = read_apic_reg(APIC_PROCPRI); + data->hlt_count++; + asm volatile("sti; hlt; cli"); + data->wake_count++; + } +} + +/* + * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to + * enable diagnosing errant writes to the APIC access address backing page in + * case of test failure. + */ +static void guest_ipi_handler(struct ex_regs *regs) +{ + ipis_rcvd++; + write_apic_reg(APIC_EOI, 77); +} + +static void sender_guest_code(struct test_data_page *data) +{ + uint64_t last_wake_count; + uint64_t last_hlt_count; + uint64_t last_ipis_rcvd_count; + uint32_t icr_val; + uint32_t icr2_val; + uint64_t tsc_start; + + verify_apic_base_addr(); + enable_xapic(); + + /* + * Init interrupt command register for sending IPIs + * + * Delivery mode=fixed, per SDM: + * "Delivers the interrupt specified in the vector field to the target + * processor." + * + * Destination mode=physical i.e. specify target by its local APIC + * ID. This vCPU assumes that the halter vCPU has already started and + * set data->halter_apic_id. + */ + icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR); + icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id); + data->icr = icr_val; + data->icr2 = icr2_val; + + last_wake_count = data->wake_count; + last_hlt_count = data->hlt_count; + last_ipis_rcvd_count = ipis_rcvd; + for (;;) { + /* + * Send IPI to halter vCPU. + * First IPI can be sent unconditionally because halter vCPU + * starts earlier. + */ + write_apic_reg(APIC_ICR2, icr2_val); + write_apic_reg(APIC_ICR, icr_val); + data->ipis_sent++; + + /* + * Wait up to ~1 sec for halter to indicate that it has: + * 1. Received the IPI + * 2. Woken up from the halt + * 3. Gone back into halt + * Current CPUs typically run at 2.x Ghz which is ~2 + * billion ticks per second. + */ + tsc_start = rdtsc(); + while (rdtsc() - tsc_start < 2000000000) { + if ((ipis_rcvd != last_ipis_rcvd_count) && + (data->wake_count != last_wake_count) && + (data->hlt_count != last_hlt_count)) + break; + } + + GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) && + (data->wake_count != last_wake_count) && + (data->hlt_count != last_hlt_count)); + + last_wake_count = data->wake_count; + last_hlt_count = data->hlt_count; + last_ipis_rcvd_count = ipis_rcvd; + } +} + +static void *vcpu_thread(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + struct ucall uc; + int old; + int r; + unsigned int exit_reason; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(r == 0, + "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + params->vcpu_id, r); + + fprintf(stderr, "vCPU thread running vCPU %u\n", params->vcpu_id); + vcpu_run(params->vm, params->vcpu_id); + exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason; + + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", + params->vcpu_id, exit_reason, exit_reason_str(exit_reason)); + + if (get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_ABORT) { + TEST_ASSERT(false, + "vCPU %u exited with error: %s.\n" + "Sending vCPU sent %lu IPIs to halting vCPU\n" + "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" + "Halter TPR=%#x PPR=%#x LVR=%#x\n" + "Migrations attempted: %lu\n" + "Migrations completed: %lu\n", + params->vcpu_id, (const char *)uc.args[0], + params->data->ipis_sent, params->data->hlt_count, + params->data->wake_count, + *params->pipis_rcvd, params->data->halter_tpr, + params->data->halter_ppr, params->data->halter_lvr, + params->data->migrations_attempted, + params->data->migrations_completed); + } + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, uint32_t vcpu_id) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(r == 0, + "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu_id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(r == 0, + "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu_id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, + uint64_t *pipis_rcvd) +{ + long pages_not_moved; + unsigned long nodemask = 0; + unsigned long nodemasks[sizeof(nodemask) * 8]; + int nodes = 0; + time_t start_time, last_update, now; + time_t interval_secs = 1; + int i, r; + int from, to; + unsigned long bit; + uint64_t hlt_count; + uint64_t wake_count; + uint64_t ipis_sent; + + fprintf(stderr, "Calling migrate_pages every %d microseconds\n", + delay_usecs); + + /* Get set of first 64 numa nodes available */ + r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + 0, MPOL_F_MEMS_ALLOWED); + TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); + + fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " + "(each 1-bit indicates node is present): %#lx\n", + sizeof(nodemask) * 8, nodemask); + + /* Init array of masks containing a single-bit in each, one for each + * available node. migrate_pages called below requires specifying nodes + * as bit masks. + */ + for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) { + if (nodemask & bit) { + nodemasks[nodes] = nodemask & bit; + nodes++; + } + } + + TEST_ASSERT(nodes > 1, + "Did not find at least 2 numa nodes. Can't do migration\n"); + + fprintf(stderr, "Migrating amongst %d nodes found\n", nodes); + + from = 0; + to = 1; + start_time = time(NULL); + last_update = start_time; + + ipis_sent = data->ipis_sent; + hlt_count = data->hlt_count; + wake_count = data->wake_count; + + while ((int)(time(NULL) - start_time) < run_secs) { + data->migrations_attempted++; + + /* + * migrate_pages with PID=0 will migrate all pages of this + * process between the nodes specified as bitmasks. The page + * backing the APIC access address belongs to this process + * because it is allocated by KVM in the context of the + * KVM_CREATE_VCPU ioctl. If that assumption ever changes this + * test may break or give a false positive signal. + */ + pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]), + &nodemasks[from], + &nodemasks[to]); + if (pages_not_moved < 0) + fprintf(stderr, + "migrate_pages failed, errno=%d\n", errno); + else if (pages_not_moved > 0) + fprintf(stderr, + "migrate_pages could not move %ld pages\n", + pages_not_moved); + else + data->migrations_completed++; + + from = to; + to++; + if (to == nodes) + to = 0; + + now = time(NULL); + if (((now - start_time) % interval_secs == 0) && + (now != last_update)) { + last_update = now; + fprintf(stderr, + "%lu seconds: Migrations attempted=%lu completed=%lu, " + "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n", + now - start_time, data->migrations_attempted, + data->migrations_completed, + data->ipis_sent, *pipis_rcvd, + data->hlt_count, data->wake_count); + + TEST_ASSERT(ipis_sent != data->ipis_sent && + hlt_count != data->hlt_count && + wake_count != data->wake_count, + "IPI, HLT and wake count have not increased " + "in the last %lu seconds. " + "HLTer is likely hung.\n", interval_secs); + + ipis_sent = data->ipis_sent; + hlt_count = data->hlt_count; + wake_count = data->wake_count; + } + usleep(delay_usecs); + } +} + +void get_cmdline_args(int argc, char *argv[], int *run_secs, + bool *migrate, int *delay_usecs) +{ + for (;;) { + int opt = getopt(argc, argv, "s:d:m"); + + if (opt == -1) + break; + switch (opt) { + case 's': + *run_secs = parse_size(optarg); + break; + case 'm': + *migrate = true; + break; + case 'd': + *delay_usecs = parse_size(optarg); + break; + default: + TEST_ASSERT(false, + "Usage: -s <runtime seconds>. Default is %d seconds.\n" + "-m adds calls to migrate_pages while vCPUs are running." + " Default is no migrations.\n" + "-d <delay microseconds> - delay between migrate_pages() calls." + " Default is %d microseconds.\n", + DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS); + } + } +} + +int main(int argc, char *argv[]) +{ + int r; + int wait_secs; + const int max_halter_wait = 10; + int run_secs = 0; + int delay_usecs = 0; + struct test_data_page *data; + vm_vaddr_t test_data_page_vaddr; + bool migrate = false; + pthread_t threads[2]; + struct thread_params params[2]; + struct kvm_vm *vm; + uint64_t *pipis_rcvd; + + get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs); + if (run_secs <= 0) + run_secs = DEFAULT_RUN_SECS; + if (delay_usecs <= 0) + delay_usecs = DEFAULT_DELAY_USECS; + + vm = vm_create_default(HALTER_VCPU_ID, 0, halter_guest_code); + params[0].vm = vm; + params[1].vm = vm; + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, HALTER_VCPU_ID); + vm_handle_exception(vm, IPI_VECTOR, guest_ipi_handler); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA, 0); + + vm_vcpu_add_default(vm, SENDER_VCPU_ID, sender_guest_code); + + test_data_page_vaddr = vm_vaddr_alloc(vm, 0x1000, 0x1000, 0, 0); + data = + (struct test_data_page *)addr_gva2hva(vm, test_data_page_vaddr); + memset(data, 0, sizeof(*data)); + params[0].data = data; + params[1].data = data; + + vcpu_args_set(vm, HALTER_VCPU_ID, 1, test_data_page_vaddr); + vcpu_args_set(vm, SENDER_VCPU_ID, 1, test_data_page_vaddr); + + pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd); + params[0].pipis_rcvd = pipis_rcvd; + params[1].pipis_rcvd = pipis_rcvd; + + /* Start halter vCPU thread and wait for it to execute first HLT. */ + params[0].vcpu_id = HALTER_VCPU_ID; + r = pthread_create(&threads[0], NULL, vcpu_thread, ¶ms[0]); + TEST_ASSERT(r == 0, + "pthread_create halter failed errno=%d", errno); + fprintf(stderr, "Halter vCPU thread started\n"); + + wait_secs = 0; + while ((wait_secs < max_halter_wait) && !data->hlt_count) { + sleep(1); + wait_secs++; + } + + TEST_ASSERT(data->hlt_count, + "Halter vCPU did not execute first HLT within %d seconds", + max_halter_wait); + + fprintf(stderr, + "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n", + data->halter_apic_id, wait_secs); + + params[1].vcpu_id = SENDER_VCPU_ID; + r = pthread_create(&threads[1], NULL, vcpu_thread, ¶ms[1]); + TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno); + + fprintf(stderr, + "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n", + run_secs); + + if (!migrate) + sleep(run_secs); + else + do_migrations(data, run_secs, delay_usecs, pipis_rcvd); + + /* + * Cancel threads and wait for them to stop. + */ + cancel_join_vcpu_thread(threads[0], HALTER_VCPU_ID); + cancel_join_vcpu_thread(threads[1], SENDER_VCPU_ID); + + fprintf(stderr, + "Test successful after running for %d seconds.\n" + "Sending vCPU sent %lu IPIs to halting vCPU\n" + "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" + "Halter APIC ID=%#x\n" + "Sender ICR value=%#x ICR2 value=%#x\n" + "Halter TPR=%#x PPR=%#x LVR=%#x\n" + "Migrations attempted: %lu\n" + "Migrations completed: %lu\n", + run_secs, data->ipis_sent, + data->hlt_count, data->wake_count, *pipis_rcvd, + data->halter_apic_id, + data->icr, data->icr2, + data->halter_tpr, data->halter_ppr, data->halter_lvr, + data->migrations_attempted, data->migrations_completed); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c new file mode 100644 index 000000000000..9246ea310587 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_vmcall_test + * + * Copyright © 2021 Amazon.com, Inc. or its affiliates. + * + * Xen shared_info / pvclock testing + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include <stdint.h> +#include <time.h> + +#define VCPU_ID 5 + +#define SHINFO_REGION_GPA 0xc0000000ULL +#define SHINFO_REGION_SLOT 10 +#define PAGE_SIZE 4096 + +#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) + +static struct kvm_vm *vm; + +#define XEN_HYPERCALL_MSR 0x40000000 + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +static void guest_code(void) +{ + GUEST_DONE(); +} + +static int cmp_timespec(struct timespec *a, struct timespec *b) +{ + if (a->tv_sec > b->tv_sec) + return 1; + else if (a->tv_sec < b->tv_sec) + return -1; + else if (a->tv_nsec > b->tv_nsec) + return 1; + else if (a->tv_nsec < b->tv_nsec) + return -1; + else + return 0; +} + +int main(int argc, char *argv[]) +{ + struct timespec min_ts, max_ts, vm_ts; + + if (!(kvm_check_cap(KVM_CAP_XEN_HVM) & + KVM_XEN_HVM_CONFIG_SHARED_INFO) ) { + print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available"); + exit(KSFT_SKIP); + } + + clock_gettime(CLOCK_REALTIME, &min_ts); + + vm = vm_create_default(VCPU_ID, 0, (void *) guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* Map a region for the shared_info page */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0); + + struct kvm_xen_hvm_config hvmc = { + .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, + .msr = XEN_HYPERCALL_MSR, + }; + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); + + struct kvm_xen_hvm_attr lm = { + .type = KVM_XEN_ATTR_TYPE_LONG_MODE, + .u.long_mode = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + struct kvm_xen_hvm_attr ha = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha); + + struct kvm_xen_vcpu_attr vi = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, + .u.gpa = SHINFO_REGION_GPA + 0x40, + }; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &vi); + + struct kvm_xen_vcpu_attr pvclock = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, + .u.gpa = PVTIME_ADDR, + }; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } + + done: + clock_gettime(CLOCK_REALTIME, &max_ts); + + /* + * Just a *really* basic check that things are being put in the + * right place. The actual calculations are much the same for + * Xen as they are for the KVM variants, so no need to check. + */ + struct pvclock_wall_clock *wc; + struct pvclock_vcpu_time_info *ti, *ti2; + + wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); + ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); + ti2 = addr_gpa2hva(vm, PVTIME_ADDR); + + vm_ts.tv_sec = wc->sec; + vm_ts.tv_nsec = wc->nsec; + TEST_ASSERT(wc->version && !(wc->version & 1), + "Bad wallclock version %x", wc->version); + TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); + TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); + + TEST_ASSERT(ti->version && !(ti->version & 1), + "Bad time_info version %x", ti->version); + TEST_ASSERT(ti2->version && !(ti2->version & 1), + "Bad time_info version %x", ti->version); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c new file mode 100644 index 000000000000..8389e0bfd711 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * xen_vmcall_test + * + * Copyright © 2020 Amazon.com, Inc. or its affiliates. + * + * Userspace hypercall testing + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 5 + +#define HCALL_REGION_GPA 0xc0000000ULL +#define HCALL_REGION_SLOT 10 +#define PAGE_SIZE 4096 + +static struct kvm_vm *vm; + +#define INPUTVALUE 17 +#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x) +#define RETVALUE 0xcafef00dfbfbffffUL + +#define XEN_HYPERCALL_MSR 0x40000200 +#define HV_GUEST_OS_ID_MSR 0x40000000 +#define HV_HYPERCALL_MSR 0x40000001 + +#define HVCALL_SIGNAL_EVENT 0x005d +#define HV_STATUS_INVALID_ALIGNMENT 4 + +static void guest_code(void) +{ + unsigned long rax = INPUTVALUE; + unsigned long rdi = ARGVALUE(1); + unsigned long rsi = ARGVALUE(2); + unsigned long rdx = ARGVALUE(3); + unsigned long rcx; + register unsigned long r10 __asm__("r10") = ARGVALUE(4); + register unsigned long r8 __asm__("r8") = ARGVALUE(5); + register unsigned long r9 __asm__("r9") = ARGVALUE(6); + + /* First a direct invocation of 'vmcall' */ + __asm__ __volatile__("vmcall" : + "=a"(rax) : + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + + /* Fill in the Xen hypercall page */ + __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR), + "a" (HCALL_REGION_GPA & 0xffffffff), + "d" (HCALL_REGION_GPA >> 32)); + + /* Set Hyper-V Guest OS ID */ + __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR), + "a" (0x5a), "d" (0)); + + /* Hyper-V hypercall page */ + u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1; + __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR), + "a" (msrval & 0xffffffff), + "d" (msrval >> 32)); + + /* Invoke a Xen hypercall */ + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + INPUTVALUE * 32), + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + + /* Invoke a Hyper-V hypercall */ + rax = 0; + rcx = HVCALL_SIGNAL_EVENT; /* code */ + rdx = 0x5a5a5a5a; /* ingpa (badly aligned) */ + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + PAGE_SIZE), + "a"(rax), "c"(rcx), "d"(rdx), + "r"(r8)); + GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + if (!(kvm_check_cap(KVM_CAP_XEN_HVM) & + KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) ) { + print_skip("KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL not available"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, (void *) guest_code); + vcpu_set_hv_cpuid(vm, VCPU_ID); + + struct kvm_xen_hvm_config hvmc = { + .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, + .msr = XEN_HYPERCALL_MSR, + }; + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); + + /* Map a region for the hypercall pages */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0); + virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2, 0); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_XEN) { + ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL); + ASSERT_EQ(run->xen.u.hcall.cpl, 0); + ASSERT_EQ(run->xen.u.hcall.longmode, 1); + ASSERT_EQ(run->xen.u.hcall.input, INPUTVALUE); + ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1)); + ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2)); + ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3)); + ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4)); + ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5)); + ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6)); + run->xen.u.hcall.result = RETVALUE; + continue; + } + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 9d01299563ee..7aafefc50aa7 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -9,6 +9,7 @@ #include <linux/vmalloc.h> #include <linux/kvm_dirty_ring.h> #include <trace/events/kvm.h> +#include "mmu_lock.h" int __weak kvm_cpu_dirty_log_size(void) { @@ -60,17 +61,16 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) if (!memslot || (offset + __fls(mask)) >= memslot->npages) return; - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); } int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) { - ring->dirty_gfns = vmalloc(size); + ring->dirty_gfns = vzalloc(size); if (!ring->dirty_gfns) return -ENOMEM; - memset(ring->dirty_gfns, 0, size); ring->size = size / sizeof(struct kvm_dirty_gfn); ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8367d88ce39b..001b9de4e727 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -58,6 +58,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "mmu_lock.h" #include "vfio.h" #define CREATE_TRACE_POINTS @@ -459,13 +460,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, int idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + + KVM_MMU_LOCK(kvm); + kvm->mmu_notifier_seq++; if (kvm_set_spte_hva(kvm, address, pte)) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); } @@ -476,7 +479,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, int need_tlb_flush = 0, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and @@ -489,7 +492,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, if (need_tlb_flush || kvm->tlbs_dirty) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return 0; @@ -500,7 +503,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have @@ -514,7 +517,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, * in conjunction with the smp_rmb in mmu_notifier_retry(). */ kvm->mmu_notifier_count--; - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); BUG_ON(kvm->mmu_notifier_count < 0); } @@ -528,13 +531,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, int young, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; @@ -549,7 +552,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, int young, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); /* * Even though we do not flush TLB, this will still adversely * affect performance on pre-Haswell Intel EPT, where there is @@ -564,7 +567,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * more sophisticated heuristic later. */ young = kvm_age_hva(kvm, start, end); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; @@ -578,9 +581,9 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, int young, idx; idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); young = kvm_test_age_hva(kvm, address); - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); return young; @@ -745,7 +748,7 @@ static struct kvm *kvm_create_vm(unsigned long type) if (!kvm) return ERR_PTR(-ENOMEM); - spin_lock_init(&kvm->mmu_lock); + KVM_MMU_LOCK_INIT(kvm); mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); @@ -1525,7 +1528,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); memset(dirty_bitmap_buffer, 0, n); - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1541,7 +1544,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); } if (flush) @@ -1636,7 +1639,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) return -EFAULT; - spin_lock(&kvm->mmu_lock); + KVM_MMU_LOCK(kvm); for (offset = log->first_page, i = offset / BITS_PER_LONG, n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; i++, offset += BITS_PER_LONG) { @@ -1659,7 +1662,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, offset, mask); } } - spin_unlock(&kvm->mmu_lock); + KVM_MMU_UNLOCK(kvm); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -1903,10 +1906,12 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, bool write_fault, bool *writable, kvm_pfn_t *p_pfn) { - unsigned long pfn; + kvm_pfn_t pfn; + pte_t *ptep; + spinlock_t *ptl; int r; - r = follow_pfn(vma, addr, &pfn); + r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does @@ -1921,14 +1926,19 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pfn(vma, addr, &pfn); + r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); if (r) return r; + } + if (write_fault && !pte_write(*ptep)) { + pfn = KVM_PFN_ERR_RO_FAULT; + goto out; } if (writable) - *writable = true; + *writable = pte_write(*ptep); + pfn = pte_pfn(*ptep); /* * Get a reference here because callers of *hva_to_pfn* and @@ -1943,6 +1953,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, */ kvm_get_pfn(pfn); +out: + pte_unmap_unlock(ptep, ptl); *p_pfn = pfn; return 0; } diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h new file mode 100644 index 000000000000..9e1308f9734c --- /dev/null +++ b/virt/kvm/mmu_lock.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_MMU_LOCK_H +#define KVM_MMU_LOCK_H 1 + +/* + * Architectures can choose whether to use an rwlock or spinlock + * for the mmu_lock. These macros, for use in common code + * only, avoids using #ifdefs in places that must deal with + * multiple architectures. + */ + +#ifdef KVM_HAVE_MMU_RWLOCK +#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock) +#else +#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) +#endif /* KVM_HAVE_MMU_RWLOCK */ + +#endif |