diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-27 00:20:14 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-27 00:20:14 +0300 |
commit | bf9095424d027e942e1d1ee74977e17b7df8e455 (patch) | |
tree | 57659cf68b7df09005bc5ada4d315d66472cebf3 /arch | |
parent | 98931dd95fd489fcbfa97da563505a6f071d7c77 (diff) | |
parent | ffd1925a596ce68bed7d81c61cb64bc35f788a9d (diff) | |
download | linux-bf9095424d027e942e1d1ee74977e17b7df8e455.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"S390:
- ultravisor communication device driver
- fix TEID on terminating storage key ops
RISC-V:
- Added Sv57x4 support for G-stage page table
- Added range based local HFENCE functions
- Added remote HFENCE functions based on VCPU requests
- Added ISA extension registers in ONE_REG interface
- Updated KVM RISC-V maintainers entry to cover selftests support
ARM:
- Add support for the ARMv8.6 WFxT extension
- Guard pages for the EL2 stacks
- Trap and emulate AArch32 ID registers to hide unsupported features
- Ability to select and save/restore the set of hypercalls exposed to
the guest
- Support for PSCI-initiated suspend in collaboration with userspace
- GICv3 register-based LPI invalidation support
- Move host PMU event merging into the vcpu data structure
- GICv3 ITS save/restore fixes
- The usual set of small-scale cleanups and fixes
x86:
- New ioctls to get/set TSC frequency for a whole VM
- Allow userspace to opt out of hypercall patching
- Only do MSR filtering for MSRs accessed by rdmsr/wrmsr
AMD SEV improvements:
- Add KVM_EXIT_SHUTDOWN metadata for SEV-ES
- V_TSC_AUX support
Nested virtualization improvements for AMD:
- Support for "nested nested" optimizations (nested vVMLOAD/VMSAVE,
nested vGIF)
- Allow AVIC to co-exist with a nested guest running
- Fixes for LBR virtualizations when a nested guest is running, and
nested LBR virtualization support
- PAUSE filtering for nested hypervisors
Guest support:
- Decoupling of vcpu_is_preempted from PV spinlocks"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (199 commits)
KVM: x86: Fix the intel_pt PMI handling wrongly considered from guest
KVM: selftests: x86: Sync the new name of the test case to .gitignore
Documentation: kvm: reorder ARM-specific section about KVM_SYSTEM_EVENT_SUSPEND
x86, kvm: use correct GFP flags for preemption disabled
KVM: LAPIC: Drop pending LAPIC timer injection when canceling the timer
x86/kvm: Alloc dummy async #PF token outside of raw spinlock
KVM: x86: avoid calling x86 emulator without a decoded instruction
KVM: SVM: Use kzalloc for sev ioctl interfaces to prevent kernel data leak
x86/fpu: KVM: Set the base guest FPU uABI size to sizeof(struct kvm_xsave)
s390/uv_uapi: depend on CONFIG_S390
KVM: selftests: x86: Fix test failure on arch lbr capable platforms
KVM: LAPIC: Trace LAPIC timer expiration on every vmentry
KVM: s390: selftest: Test suppression indication on key prot exception
KVM: s390: Don't indicate suppression on dirtying, failing memop
selftests: drivers/s390x: Add uvdevice tests
drivers/s390/char: Add Ultravisor io device
MAINTAINERS: Update KVM RISC-V entry to cover selftests support
RISC-V: KVM: Introduce ISA extension register
RISC-V: KVM: Cleanup stale TLB entries when host CPU changes
RISC-V: KVM: Add remote HFENCE functions based on VCPU requests
...
Diffstat (limited to 'arch')
97 files changed, 5238 insertions, 2053 deletions
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 62217be36217..9f3e2c3d2ca0 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -16,7 +16,11 @@ #define sev() asm volatile("sev" : : : "memory") #define wfe() asm volatile("wfe" : : : "memory") +#define wfet(val) asm volatile("msr s0_3_c1_c0_0, %0" \ + : : "r" (val) : "memory") #define wfi() asm volatile("wfi" : : : "memory") +#define wfit(val) asm volatile("msr s0_3_c1_c0_1, %0" \ + : : "r" (val) : "memory") #define isb() asm volatile("isb" : : : "memory") #define dmb(opt) asm volatile("dmb " #opt : : : "memory") diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 92331c07c2d1..8aa0d276a636 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -118,6 +118,10 @@ #define APPLE_CPU_PART_M1_ICESTORM 0x022 #define APPLE_CPU_PART_M1_FIRESTORM 0x023 +#define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 +#define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 +#define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 +#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) @@ -164,6 +168,10 @@ #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) +#define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO) +#define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) +#define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX) +#define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 8f236de7359c..15b34fbfca66 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -135,7 +135,10 @@ #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) -#define ESR_ELx_WFx_ISS_TI (UL(1) << 0) +#define ESR_ELx_WFx_ISS_RN (UL(0x1F) << 5) +#define ESR_ELx_WFx_ISS_RV (UL(1) << 2) +#define ESR_ELx_WFx_ISS_TI (UL(3) << 0) +#define ESR_ELx_WFx_ISS_WFxT (UL(2) << 0) #define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((UL(1) << 16) - 1) @@ -148,7 +151,8 @@ #define DISR_EL1_ESR_MASK (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC) /* ESR value templates for specific events */ -#define ESR_ELx_WFx_MASK (ESR_ELx_EC_MASK | ESR_ELx_WFx_ISS_TI) +#define ESR_ELx_WFx_MASK (ESR_ELx_EC_MASK | \ + (ESR_ELx_WFx_ISS_TI & ~ESR_ELx_WFx_ISS_WFxT)) #define ESR_ELx_WFx_WFI_VAL ((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) | \ ESR_ELx_WFx_ISS_WFI) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 9f0ce004fdbc..aa443d8f8cfb 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -117,6 +117,7 @@ #define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32) #define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32) #define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64) +#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 13ae232ec4a1..8aa8492dafc0 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -80,11 +80,12 @@ * FMO: Override CPSR.F and enable signaling with VF * SWIO: Turn set/way invalidates into set/way clean+invalidate * PTW: Take a stage2 fault if a stage1 walk steps in device memory + * TID3: Trap EL1 reads of group 3 ID registers */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ - HCR_FMO | HCR_IMO | HCR_PTW ) + HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index d5b0386ef765..2e277f2ed671 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -169,6 +169,7 @@ struct kvm_nvhe_init_params { unsigned long tcr_el2; unsigned long tpidr_el2; unsigned long stack_hyp_va; + unsigned long stack_pa; phys_addr_t pgd_pa; unsigned long hcr_el2; unsigned long vttbr; diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 08233172e7a9..0e66edd3aff2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -87,13 +87,6 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; - else - /* - * TID3: trap feature register accesses that we virtualise. - * For now this is conditional, since no AArch32 feature regs - * are currently virtualised. - */ - vcpu->arch.hcr_el2 |= HCR_TID3; if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || vcpu_el1_is_32bit(vcpu)) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d5888dedf02a..47a1e25e25bb 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,6 +46,7 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) +#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -101,15 +102,25 @@ struct kvm_s2_mmu { struct kvm_arch_memory_slot { }; +/** + * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests + * + * @std_bmap: Bitmap of standard secure service calls + * @std_hyp_bmap: Bitmap of standard hypervisor service calls + * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls + */ +struct kvm_smccc_features { + unsigned long std_bmap; + unsigned long std_hyp_bmap; + unsigned long vendor_hyp_bmap; +}; + struct kvm_arch { struct kvm_s2_mmu mmu; /* VTCR_EL2 value for this VM */ u64 vtcr; - /* The maximum number of vCPUs depends on the used GIC model */ - int max_vcpus; - /* Interrupt controller */ struct vgic_dist vgic; @@ -136,6 +147,8 @@ struct kvm_arch { */ #define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3 #define KVM_ARCH_FLAG_EL1_32BIT 4 + /* PSCI SYSTEM_SUSPEND enabled for the guest */ +#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 unsigned long flags; @@ -150,6 +163,9 @@ struct kvm_arch { u8 pfr0_csv2; u8 pfr0_csv3; + + /* Hypercall features firmware registers' descriptor */ + struct kvm_smccc_features smccc_feat; }; struct kvm_vcpu_fault_info { @@ -254,14 +270,8 @@ struct kvm_cpu_context { struct kvm_vcpu *__hyp_running_vcpu; }; -struct kvm_pmu_events { - u32 events_host; - u32 events_guest; -}; - struct kvm_host_data { struct kvm_cpu_context host_ctxt; - struct kvm_pmu_events pmu_events; }; struct kvm_host_psci_config { @@ -368,8 +378,8 @@ struct kvm_vcpu_arch { u32 mdscr_el1; } guest_debug_preserved; - /* vcpu power-off state */ - bool power_off; + /* vcpu power state */ + struct kvm_mp_state mp_state; /* Don't run the guest (internal implementation need) */ bool pause; @@ -455,6 +465,7 @@ struct kvm_vcpu_arch { #define KVM_ARM64_FP_FOREIGN_FPSTATE (1 << 14) #define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ #define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ +#define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ @@ -687,10 +698,11 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); -void kvm_sys_reg_table_init(void); +int kvm_sys_reg_table_init(void); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); @@ -799,9 +811,6 @@ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u32 clr); - -void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); -void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); #else static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} @@ -833,8 +842,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); #define kvm_has_mte(kvm) \ (system_supports_mte() && \ test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags)) -#define kvm_vcpu_has_pmu(vcpu) \ - (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM @@ -845,4 +852,7 @@ void __init kvm_hyp_reserve(void); static inline void kvm_hyp_reserve(void) { } #endif +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); +bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 74735a864eee..b208da3bebec 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -154,6 +154,9 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) int kvm_share_hyp(void *from, void *to); void kvm_unshare_hyp(void *from, void *to); int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); +int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot); +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index b0256cec63b5..4bb2cc8ac446 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -87,5 +87,6 @@ #define HWCAP2_SME_B16F32 (1 << 28) #define HWCAP2_SME_F32F32 (1 << 29) #define HWCAP2_SME_FA64 (1 << 30) +#define HWCAP2_WFXT (1UL << 31) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index ab585359242d..3bb134355874 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -334,6 +334,40 @@ struct kvm_arm_copy_mte_tags { #define KVM_ARM64_SVE_VLS_WORDS \ ((KVM_ARM64_SVE_VQ_MAX - KVM_ARM64_SVE_VQ_MIN) / 64 + 1) +/* Bitmap feature firmware registers */ +#define KVM_REG_ARM_FW_FEAT_BMAP (0x0016 << KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_FW_FEAT_BMAP_REG(r) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_FW_FEAT_BMAP | \ + ((r) & 0xffff)) + +#define KVM_REG_ARM_STD_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(0) + +enum { + KVM_REG_ARM_STD_BIT_TRNG_V1_0 = 0, +#ifdef __KERNEL__ + KVM_REG_ARM_STD_BMAP_BIT_COUNT, +#endif +}; + +#define KVM_REG_ARM_STD_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(1) + +enum { + KVM_REG_ARM_STD_HYP_BIT_PV_TIME = 0, +#ifdef __KERNEL__ + KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT, +#endif +}; + +#define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2) + +enum { + KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT = 0, + KVM_REG_ARM_VENDOR_HYP_BIT_PTP = 1, +#ifdef __KERNEL__ + KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT, +#endif +}; + /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4ccddf382e5b..42ea2bd856c6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -237,6 +237,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_GPA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_WFXT_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -2517,6 +2518,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = fa64_kernel_enable, }, #endif /* CONFIG_ARM64_SME */ + { + .desc = "WFx with timeout", + .capability = ARM64_HAS_WFXT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR2_WFXT_SHIFT, + .field_width = 4, + .matches = has_cpuid_feature, + .min_field_value = ID_AA64ISAR2_WFXT_SUPPORTED, + }, {}, }; @@ -2650,6 +2662,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP), HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES), + HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_WFXT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_WFXT_SUPPORTED, CAP_HWCAP, KERNEL_HWCAP_WFXT), #ifdef CONFIG_ARM64_SME HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME), HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_FA64, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 8a8136a096ac..8eff0a34ffd4 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -106,6 +106,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_B16F32] = "smeb16f32", [KERNEL_HWCAP_SME_F32F32] = "smef32f32", [KERNEL_HWCAP_SME_FA64] = "smefa64", + [KERNEL_HWCAP_WFXT] = "wfxt", }; #ifdef CONFIG_COMPAT diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 261644b1a6bb..aa127ae9f675 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ - vgic-sys-reg-v3.o fpsimd.o pmu.o pkvm.o \ + vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ @@ -22,7 +22,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ vgic/vgic-its.o vgic/vgic-debug.o -kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o +kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o always-y := hyp_constants.h hyp-constants.s diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 6e542e2eae32..4e39ace073af 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -208,18 +208,16 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) return IRQ_HANDLED; } -static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx, + u64 val) { - u64 cval, now; - - cval = timer_get_cval(timer_ctx); - now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); + u64 now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); - if (now < cval) { + if (now < val) { u64 ns; ns = cyclecounter_cyc2ns(timecounter->cc, - cval - now, + val - now, timecounter->mask, &timecounter->frac); return ns; @@ -228,6 +226,11 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) return 0; } +static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) +{ + return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx)); +} + static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { WARN_ON(timer_ctx && timer_ctx->loaded); @@ -236,6 +239,20 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) (ARCH_TIMER_CTRL_IT_MASK | ARCH_TIMER_CTRL_ENABLE)) == ARCH_TIMER_CTRL_ENABLE); } +static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) +{ + return (cpus_have_final_cap(ARM64_HAS_WFXT) && + (vcpu->arch.flags & KVM_ARM64_WFIT)); +} + +static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *ctx = vcpu_vtimer(vcpu); + u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + return kvm_counter_compute_delta(ctx, val); +} + /* * Returns the earliest expiration time in ns among guest timers. * Note that it will return 0 if none of timers can fire. @@ -253,6 +270,9 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); } + if (vcpu_has_wfit_active(vcpu)) + min_delta = min(min_delta, wfit_delay_ns(vcpu)); + /* If none of timers can fire, then return 0 */ if (min_delta == ULLONG_MAX) return 0; @@ -350,15 +370,9 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) return cval <= now; } -bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - struct timer_map map; - - get_timer_map(vcpu, &map); - - return kvm_timer_should_fire(map.direct_vtimer) || - kvm_timer_should_fire(map.direct_ptimer) || - kvm_timer_should_fire(map.emul_ptimer); + return vcpu_has_wfit_active(vcpu) && wfit_delay_ns(vcpu) == 0; } /* @@ -484,7 +498,8 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) */ if (!kvm_timer_irq_can_fire(map.direct_vtimer) && !kvm_timer_irq_can_fire(map.direct_ptimer) && - !kvm_timer_irq_can_fire(map.emul_ptimer)) + !kvm_timer_irq_can_fire(map.emul_ptimer) && + !vcpu_has_wfit_active(vcpu)) return; /* diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index cedc3ba2c098..400bb0fe2745 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -97,6 +97,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_ARM_SYSTEM_SUSPEND: + r = 0; + set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); + break; default: r = -EINVAL; break; @@ -153,9 +157,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_vgic_early_init(kvm); /* The maximum number of VCPUs is limited by the host's GIC model */ - kvm->arch.max_vcpus = kvm_arm_default_max_vcpus(); + kvm->max_vcpus = kvm_arm_default_max_vcpus(); set_default_spectre(kvm); + kvm_arm_init_hypercalls(kvm); return ret; out_free_stage2_pgd: @@ -210,6 +215,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: + case KVM_CAP_ARM_SYSTEM_SUSPEND: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -230,7 +236,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: if (kvm) - r = kvm->arch.max_vcpus; + r = kvm->max_vcpus; else r = kvm_arm_default_max_vcpus(); break; @@ -306,7 +312,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) return -EBUSY; - if (id >= kvm->arch.max_vcpus) + if (id >= kvm->max_vcpus) return -EINVAL; return 0; @@ -356,11 +362,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_arm_vcpu_destroy(vcpu); } -int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -{ - return kvm_timer_is_pending(vcpu); -} - void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { @@ -432,20 +433,34 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void vcpu_power_off(struct kvm_vcpu *vcpu) +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.power_off = true; + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } +bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED; +} + +static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED; + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + kvm_vcpu_kick(vcpu); +} + +static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.power_off) - mp_state->mp_state = KVM_MP_STATE_STOPPED; - else - mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + *mp_state = vcpu->arch.mp_state; return 0; } @@ -457,10 +472,13 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.power_off = false; + vcpu->arch.mp_state = *mp_state; break; case KVM_MP_STATE_STOPPED: - vcpu_power_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); + break; + case KVM_MP_STATE_SUSPENDED: + kvm_arm_vcpu_suspend(vcpu); break; default: ret = -EINVAL; @@ -480,7 +498,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF); return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) - && !v->arch.power_off && !v->arch.pause); + && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); } bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) @@ -592,15 +610,15 @@ void kvm_arm_resume_guest(struct kvm *kvm) } } -static void vcpu_req_sleep(struct kvm_vcpu *vcpu) +static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); rcuwait_wait_event(wait, - (!vcpu->arch.power_off) &&(!vcpu->arch.pause), + (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), TASK_INTERRUPTIBLE); - if (vcpu->arch.power_off || vcpu->arch.pause) { + if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) { /* Awaken to handle a signal, request we sleep again later. */ kvm_make_request(KVM_REQ_SLEEP, vcpu); } @@ -639,6 +657,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_enable(); kvm_vcpu_halt(vcpu); + vcpu->arch.flags &= ~KVM_ARM64_WFIT; kvm_clear_request(KVM_REQ_UNHALT, vcpu); preempt_disable(); @@ -646,11 +665,53 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_enable(); } -static void check_vcpu_requests(struct kvm_vcpu *vcpu) +static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + if (!kvm_arm_vcpu_suspended(vcpu)) + return 1; + + kvm_vcpu_wfi(vcpu); + + /* + * The suspend state is sticky; we do not leave it until userspace + * explicitly marks the vCPU as runnable. Request that we suspend again + * later. + */ + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + + /* + * Check to make sure the vCPU is actually runnable. If so, exit to + * userspace informing it of the wakeup condition. + */ + if (kvm_arch_vcpu_runnable(vcpu)) { + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + return 0; + } + + /* + * Otherwise, we were unblocked to process a different event, such as a + * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to + * process the event. + */ + return 1; +} + +/** + * check_vcpu_requests - check and handle pending vCPU requests + * @vcpu: the VCPU pointer + * + * Return: 1 if we should enter the guest + * 0 if we should exit to userspace + * < 0 if we should exit to userspace, where the return value indicates + * an error + */ +static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) - vcpu_req_sleep(vcpu); + kvm_vcpu_sleep(vcpu); if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) kvm_reset_vcpu(vcpu); @@ -675,7 +736,12 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) kvm_pmu_handle_pmcr(vcpu, __vcpu_sys_reg(vcpu, PMCR_EL0)); + + if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) + return kvm_vcpu_suspend(vcpu); } + + return 1; } static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) @@ -792,7 +858,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (!ret) ret = 1; - check_vcpu_requests(vcpu); + if (ret > 0) + ret = check_vcpu_requests(vcpu); /* * Preparing the interrupts to be injected also @@ -816,6 +883,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vgic_flush_hwstate(vcpu); + kvm_pmu_update_vcpu_events(vcpu); + /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. @@ -1125,9 +1194,9 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, * Handle the "start in power-off" case. */ if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu_power_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); else - vcpu->arch.power_off = false; + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; return 0; } @@ -1485,7 +1554,6 @@ static void cpu_prepare_hyp_mode(int cpu) tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; params->tcr_el2 = tcr; - params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE); params->pgd_pa = kvm_mmu_get_httbr(); if (is_protected_kvm_enabled()) params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; @@ -1763,8 +1831,6 @@ static int init_subsystems(void) kvm_register_perf_callbacks(NULL); - kvm_sys_reg_table_init(); - out: if (err || !is_protected_kvm_enabled()) on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); @@ -1935,14 +2001,46 @@ static int init_hyp_mode(void) * Map the Hyp stack pages */ for_each_possible_cpu(cpu) { + struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, - PAGE_HYP); + unsigned long hyp_addr; + + /* + * Allocate a contiguous HYP private VA range for the stack + * and guard page. The allocation is also aligned based on + * the order of its size. + */ + err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); + if (err) { + kvm_err("Cannot allocate hyp stack guard page\n"); + goto out_err; + } + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the PAGE_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * PAGE_SHIFT bit as 0 - this is used for overflow detection. + */ + err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE, + __pa(stack_page), PAGE_HYP); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; } + + /* + * Save the stack PA in nvhe_init_params. This will be needed + * to recreate the stack mapping in protected nVHE mode. + * __hyp_pa() won't do the right thing there, since the stack + * has been mapped in the flexible private VA space. + */ + params->stack_pa = __pa(stack_page); + + params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); } for_each_possible_cpu(cpu) { @@ -2091,6 +2189,12 @@ int kvm_arch_init(void *opaque) return -ENODEV; } + err = kvm_sys_reg_table_init(); + if (err) { + kvm_info("Error initializing system register tables"); + return err; + } + in_hyp_mode = is_kernel_in_hyp_mode(); if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 7e15b03fbdf8..8c607199cad1 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -18,7 +18,7 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/fs.h> -#include <kvm/arm_psci.h> +#include <kvm/arm_hypercalls.h> #include <asm/cputype.h> #include <linux/uaccess.h> #include <asm/fpsimd.h> @@ -756,7 +756,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg); - case KVM_REG_ARM_FW: return kvm_arm_get_fw_reg(vcpu, reg); + case KVM_REG_ARM_FW: + case KVM_REG_ARM_FW_FEAT_BMAP: + return kvm_arm_get_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg); } @@ -774,7 +776,9 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); - case KVM_REG_ARM_FW: return kvm_arm_set_fw_reg(vcpu, reg); + case KVM_REG_ARM_FW: + case KVM_REG_ARM_FW_FEAT_BMAP: + return kvm_arm_set_fw_reg(vcpu, reg); case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg); } diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 0b829292dc54..f66c0142b335 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -80,24 +80,51 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu) * * @vcpu: the vcpu pointer * - * WFE: Yield the CPU and come back to this vcpu when the scheduler + * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler * decides to. * WFI: Simply call kvm_vcpu_halt(), which will halt execution of * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. + * WFIT: Same as WFI, with a timed wakeup implemented as a background timer + * + * WF{I,E}T can immediately return if the deadline has already expired. */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_WFx_ISS_WFE) { + u64 esr = kvm_vcpu_get_esr(vcpu); + + if (esr & ESR_ELx_WFx_ISS_WFE) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; - kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); } else { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; - kvm_vcpu_wfi(vcpu); } + if (esr & ESR_ELx_WFx_ISS_WFxT) { + if (esr & ESR_ELx_WFx_ISS_RV) { + u64 val, now; + + now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT); + val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + + if (now >= val) + goto out; + } else { + /* Treat WFxT as WFx if RN is invalid */ + esr &= ~ESR_ELx_WFx_ISS_WFxT; + } + } + + if (esr & ESR_ELx_WFx_ISS_WFE) { + kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); + } else { + if (esr & ESR_ELx_WFx_ISS_WFxT) + vcpu->arch.flags |= KVM_ARM64_WFIT; + + kvm_vcpu_wfi(vcpu); + } +out: kvm_incr_pc(vcpu); return 1; @@ -169,6 +196,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_CP15_64] = kvm_handle_cp15_64, [ESR_ELx_EC_CP14_MR] = kvm_handle_cp14_32, [ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store, + [ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id, [ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64, [ESR_ELx_EC_HVC32] = handle_hvc, [ESR_ELx_EC_SMC32] = handle_smc, @@ -297,13 +325,8 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_in_kimg = __phys_to_kimg(elr_phys); u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; + u64 panic_addr = elr_virt + hyp_offset; - /* - * The nVHE hyp symbols are not included by kallsyms to avoid issues - * with aliasing. That means that the symbols cannot be printed with the - * "%pS" format specifier, so fall back to the vmlinux address if - * there's no better option. - */ if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && @@ -323,9 +346,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset); + kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr, + (void *)panic_addr); } else { - kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset); + kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr, + (void *)panic_addr); } /* diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 2d08510c6cc1..42d8eb9bfe72 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -19,8 +19,10 @@ int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); -unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, - enum kvm_pgtable_prot prot); +int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr); +int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, unsigned long *start, unsigned long *end) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 727c979b2b69..ea6a397b64a6 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -80,7 +80,7 @@ SYM_FUNC_START(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr - ldr lr, =nvhe_hyp_panic_handler + adr_l lr, nvhe_hyp_panic_handler hyp_kimg_va lr, x6 msr elr_el2, lr @@ -125,13 +125,11 @@ alternative_else_nop_endif add sp, sp, #16 /* * Compute the idmap address of __kvm_handle_stub_hvc and - * jump there. Since we use kimage_voffset, do not use the - * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead - * (by loading it from the constant pool). + * jump there. * * Preserve x0-x4, which may contain stub parameters. */ - ldr x5, =__kvm_handle_stub_hvc + adr_l x5, __kvm_handle_stub_hvc hyp_pa x5, x6 br x5 SYM_FUNC_END(__host_hvc) @@ -153,6 +151,18 @@ SYM_FUNC_END(__host_hvc) .macro invalid_host_el2_vect .align 7 + + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * of SP should always be 1. + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + /* If a guest is loaded, panic out of it. */ stp x0, x1, [sp, #-16]! get_loaded_vcpu x0, x1 @@ -165,6 +175,18 @@ SYM_FUNC_END(__host_hvc) * been partially clobbered by __host_enter. */ b hyp_panic + +.L__hyp_sp_overflow\@: + /* + * Reset SP to the top of the stack, to allow handling the hyp_panic. + * This corrupts the stack but is ok, since we won't be attempting + * any unwinding here. + */ + ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1 + mov sp, x0 + + b hyp_panic_bad_stack + ASM_BUG() .endm .macro invalid_host_el1_vect diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 5e2197db0d32..3cea4b6ac23e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -160,7 +160,23 @@ static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ct DECLARE_REG(size_t, size, host_ctxt, 2); DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); - cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot); + /* + * __pkvm_create_private_mapping() populates a pointer with the + * hypervisor start address of the allocation. + * + * However, handle___pkvm_create_private_mapping() hypercall crosses the + * EL1/EL2 boundary so the pointer would not be valid in this context. + * + * Instead pass the allocation address as the return value (or return + * ERR_PTR() on failure). + */ + unsigned long haddr; + int err = __pkvm_create_private_mapping(phys, size, prot, &haddr); + + if (err) + haddr = (unsigned long)ERR_PTR(err); + + cpu_reg(host_ctxt, 1) = haddr; } static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index cdbe8e246418..96193cb31a39 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -37,36 +37,60 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, return err; } -unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, - enum kvm_pgtable_prot prot) +/** + * pkvm_alloc_private_va_range - Allocates a private VA range. + * @size: The size of the VA range to reserve. + * @haddr: The hypervisor virtual start address of the allocation. + * + * The private virtual address (VA) range is allocated above __io_map_base + * and aligned based on the order of @size. + * + * Return: 0 on success or negative error code on failure. + */ +int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) { - unsigned long addr; - int err; + unsigned long base, addr; + int ret = 0; hyp_spin_lock(&pkvm_pgd_lock); - size = PAGE_ALIGN(size + offset_in_page(phys)); - addr = __io_map_base; - __io_map_base += size; + /* Align the allocation based on the order of its size */ + addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size)); - /* Are we overflowing on the vmemmap ? */ - if (__io_map_base > __hyp_vmemmap) { - __io_map_base -= size; - addr = (unsigned long)ERR_PTR(-ENOMEM); - goto out; - } + /* The allocated size is always a multiple of PAGE_SIZE */ + base = addr + PAGE_ALIGN(size); - err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot); - if (err) { - addr = (unsigned long)ERR_PTR(err); - goto out; + /* Are we overflowing on the vmemmap ? */ + if (!addr || base > __hyp_vmemmap) + ret = -ENOMEM; + else { + __io_map_base = base; + *haddr = addr; } - addr = addr + offset_in_page(phys); -out: hyp_spin_unlock(&pkvm_pgd_lock); - return addr; + return ret; +} + +int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr) +{ + unsigned long addr; + int err; + + size = PAGE_ALIGN(size + offset_in_page(phys)); + err = pkvm_alloc_private_va_range(size, &addr); + if (err) + return err; + + err = __pkvm_create_mappings(addr, size, phys, prot); + if (err) + return err; + + *haddr = addr + offset_in_page(phys); + return err; } int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) @@ -146,7 +170,8 @@ int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot) int hyp_map_vectors(void) { phys_addr_t phys; - void *bp_base; + unsigned long bp_base; + int ret; if (!kvm_system_needs_idmapped_vectors()) { __hyp_bp_vect_base = __bp_harden_hyp_vecs; @@ -154,13 +179,12 @@ int hyp_map_vectors(void) } phys = __hyp_pa(__bp_harden_hyp_vecs); - bp_base = (void *)__pkvm_create_private_mapping(phys, - __BP_HARDEN_HYP_VECS_SZ, - PAGE_HYP_EXEC); - if (IS_ERR_OR_NULL(bp_base)) - return PTR_ERR(bp_base); + ret = __pkvm_create_private_mapping(phys, __BP_HARDEN_HYP_VECS_SZ, + PAGE_HYP_EXEC, &bp_base); + if (ret) + return ret; - __hyp_bp_vect_base = bp_base; + __hyp_bp_vect_base = (void *)bp_base; return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 27af337f9fea..e8d4ea2fcfa0 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -99,17 +99,42 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; for (i = 0; i < hyp_nr_cpus; i++) { + struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i); + unsigned long hyp_addr; + start = (void *)kern_hyp_va(per_cpu_base[i]); end = start + PAGE_ALIGN(hyp_percpu_size); ret = pkvm_create_mappings(start, end, PAGE_HYP); if (ret) return ret; - end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va; - start = end - PAGE_SIZE; - ret = pkvm_create_mappings(start, end, PAGE_HYP); + /* + * Allocate a contiguous HYP private VA range for the stack + * and guard page. The allocation is also aligned based on + * the order of its size. + */ + ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr); + if (ret) + return ret; + + /* + * Since the stack grows downwards, map the stack to the page + * at the higher address and leave the lower guard page + * unbacked. + * + * Any valid stack address now has the PAGE_SHIFT bit as 1 + * and addresses corresponding to the guard page have the + * PAGE_SHIFT bit as 0 - this is used for overflow detection. + */ + hyp_spin_lock(&pkvm_pgd_lock); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE, + PAGE_SIZE, params->stack_pa, PAGE_HYP); + hyp_spin_unlock(&pkvm_pgd_lock); if (ret) return ret; + + /* Update stack_hyp_va to end of the stack's private VA range */ + params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE); } /* diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index caace61ea459..6db801db8f27 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -150,16 +150,13 @@ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) } } -/** +/* * Disable host events, enable guest events */ -static bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt) +#ifdef CONFIG_HW_PERF_EVENTS +static bool __pmu_switch_to_guest(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; - struct kvm_pmu_events *pmu; - - host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); - pmu = &host->pmu_events; + struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events; if (pmu->events_host) write_sysreg(pmu->events_host, pmcntenclr_el0); @@ -170,16 +167,12 @@ static bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt) return (pmu->events_host || pmu->events_guest); } -/** +/* * Disable guest events, enable host events */ -static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) +static void __pmu_switch_to_host(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; - struct kvm_pmu_events *pmu; - - host = container_of(host_ctxt, struct kvm_host_data, host_ctxt); - pmu = &host->pmu_events; + struct kvm_pmu_events *pmu = &vcpu->arch.pmu.events; if (pmu->events_guest) write_sysreg(pmu->events_guest, pmcntenclr_el0); @@ -187,8 +180,12 @@ static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) if (pmu->events_host) write_sysreg(pmu->events_host, pmcntenset_el0); } +#else +#define __pmu_switch_to_guest(v) ({ false; }) +#define __pmu_switch_to_host(v) do {} while (0) +#endif -/** +/* * Handler for protected VM MSR, MRS or System instruction execution in AArch64. * * Returns true if the hypervisor has handled the exit, and control should go @@ -205,23 +202,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } -/** - * Handler for protected floating-point and Advanced SIMD accesses. - * - * Returns true if the hypervisor has handled the exit, and control should go - * back to the guest, or false if it hasn't. - */ -static bool kvm_handle_pvm_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) -{ - /* Linux guests assume support for floating-point and Advanced SIMD. */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD), - PVM_ID_AA64PFR0_ALLOW)); - - return kvm_hyp_handle_fpsimd(vcpu, exit_code); -} - static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, @@ -237,7 +217,7 @@ static const exit_handler_fn pvm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, - [ESR_ELx_EC_FP_ASIMD] = kvm_handle_pvm_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, @@ -304,7 +284,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; - pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); + pmu_switch_needed = __pmu_switch_to_guest(vcpu); __sysreg_save_state_nvhe(host_ctxt); /* @@ -366,7 +346,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __debug_restore_host_buffers_nvhe(vcpu); if (pmu_switch_needed) - __pmu_switch_to_host(host_ctxt); + __pmu_switch_to_host(vcpu); /* Returning to host will clear PSR.I, remask PMR if needed */ if (system_uses_irq_prio_masking()) @@ -377,7 +357,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return exit_code; } -void __noreturn hyp_panic(void) +asmlinkage void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); @@ -399,6 +379,11 @@ void __noreturn hyp_panic(void) unreachable(); } +asmlinkage void __noreturn hyp_panic_bad_stack(void) +{ + hyp_panic(); +} + asmlinkage void kvm_unexpected_el2_exception(void) { return __kvm_unexpected_el2_exception(); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 619f94fc95fa..b6d86e423319 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -90,9 +90,6 @@ static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) u64 set_mask = 0; u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; - if (!vcpu_has_sve(vcpu)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 202b8c455724..c9f401fa01a9 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -9,6 +9,13 @@ #include <kvm/arm_hypercalls.h> #include <kvm/arm_psci.h> +#define KVM_ARM_SMCCC_STD_FEATURES \ + GENMASK(KVM_REG_ARM_STD_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_STD_HYP_FEATURES \ + GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \ + GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0) + static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) { struct system_time_snapshot systime_snapshot; @@ -58,13 +65,73 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) val[3] = lower_32_bits(cycles); } +static bool kvm_hvc_call_default_allowed(u32 func_id) +{ + switch (func_id) { + /* + * List of function-ids that are not gated with the bitmapped + * feature firmware registers, and are to be allowed for + * servicing the call by default. + */ + case ARM_SMCCC_VERSION_FUNC_ID: + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + return true; + default: + /* PSCI 0.2 and up is in the 0:0x1f range */ + if (ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) <= 0x1f) + return true; + + /* + * KVM's PSCI 0.1 doesn't comply with SMCCC, and has + * its own function-id base and range + */ + if (func_id >= KVM_PSCI_FN(0) && func_id <= KVM_PSCI_FN(3)) + return true; + + return false; + } +} + +static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + + switch (func_id) { + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_GET_UUID: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + return test_bit(KVM_REG_ARM_STD_BIT_TRNG_V1_0, + &smccc_feat->std_bmap); + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + return test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME, + &smccc_feat->std_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT, + &smccc_feat->vendor_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP, + &smccc_feat->vendor_hyp_bmap); + default: + return kvm_hvc_call_default_allowed(func_id); + } +} + int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; u32 func_id = smccc_get_function(vcpu); u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; gpa_t gpa; + if (!kvm_hvc_call_allowed(vcpu, func_id)) + goto out; + switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: val[0] = ARM_SMCCC_VERSION_1_1; @@ -120,7 +187,9 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val[0] = SMCCC_RET_SUCCESS; + if (test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME, + &smccc_feat->std_hyp_bmap)) + val[0] = SMCCC_RET_SUCCESS; break; } break; @@ -139,8 +208,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; break; case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: - val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); - val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP); + val[0] = smccc_feat->vendor_hyp_bmap; break; case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); @@ -155,6 +223,259 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) return kvm_psci_call(vcpu); } +out: smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return 1; } + +static const u64 kvm_arm_fw_reg_ids[] = { + KVM_REG_ARM_PSCI_VERSION, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, + KVM_REG_ARM_STD_BMAP, + KVM_REG_ARM_STD_HYP_BMAP, + KVM_REG_ARM_VENDOR_HYP_BMAP, +}; + +void kvm_arm_init_hypercalls(struct kvm *kvm) +{ + struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat; + + smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES; + smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES; + smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; +} + +int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) +{ + return ARRAY_SIZE(kvm_arm_fw_reg_ids); +} + +int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) { + if (put_user(kvm_arm_fw_reg_ids[i], uindices++)) + return -EFAULT; + } + + return 0; +} + +#define KVM_REG_FEATURE_LEVEL_MASK GENMASK(3, 0) + +/* + * Convert the workaround level into an easy-to-compare number, where higher + * values mean better protection. + */ +static int get_kernel_wa_level(u64 regid) +{ + switch (regid) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + switch (arm64_get_spectre_v2_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + switch (arm64_get_spectre_v4_state()) { + case SPECTRE_MITIGATED: + /* + * As for the hypercall discovery, we pretend we + * don't have any FW mitigation if SSBS is there at + * all times. + */ + if (cpus_have_final_cap(ARM64_SSBS)) + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + fallthrough; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + } + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + } + + return -EINVAL; +} + +int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + val = kvm_psci_version(vcpu); + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + break; + case KVM_REG_ARM_STD_BMAP: + val = READ_ONCE(smccc_feat->std_bmap); + break; + case KVM_REG_ARM_STD_HYP_BMAP: + val = READ_ONCE(smccc_feat->std_hyp_bmap); + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP: + val = READ_ONCE(smccc_feat->vendor_hyp_bmap); + break; + default: + return -ENOENT; + } + + if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) +{ + int ret = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat; + unsigned long *fw_reg_bmap, fw_reg_features; + + switch (reg_id) { + case KVM_REG_ARM_STD_BMAP: + fw_reg_bmap = &smccc_feat->std_bmap; + fw_reg_features = KVM_ARM_SMCCC_STD_FEATURES; + break; + case KVM_REG_ARM_STD_HYP_BMAP: + fw_reg_bmap = &smccc_feat->std_hyp_bmap; + fw_reg_features = KVM_ARM_SMCCC_STD_HYP_FEATURES; + break; + case KVM_REG_ARM_VENDOR_HYP_BMAP: + fw_reg_bmap = &smccc_feat->vendor_hyp_bmap; + fw_reg_features = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; + break; + default: + return -ENOENT; + } + + /* Check for unsupported bit */ + if (val & ~fw_reg_features) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) && + val != *fw_reg_bmap) { + ret = -EBUSY; + goto out; + } + + WRITE_ONCE(*fw_reg_bmap, val); +out: + mutex_unlock(&kvm->lock); + return ret; +} + +int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + int wa_level; + + if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + switch (reg->id) { + case KVM_REG_ARM_PSCI_VERSION: + { + bool wants_02; + + wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); + + switch (val) { + case KVM_ARM_PSCI_0_1: + if (wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + case KVM_ARM_PSCI_0_2: + case KVM_ARM_PSCI_1_0: + case KVM_ARM_PSCI_1_1: + if (!wants_02) + return -EINVAL; + vcpu->kvm->arch.psci_version = val; + return 0; + } + break; + } + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + if (val & ~KVM_REG_FEATURE_LEVEL_MASK) + return -EINVAL; + + if (get_kernel_wa_level(reg->id) < val) + return -EINVAL; + + return 0; + + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) + return -EINVAL; + + /* The enabled bit must not be set unless the level is AVAIL. */ + if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) && + (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL) + return -EINVAL; + + /* + * Map all the possible incoming states to the only two we + * really want to deal with. + */ + switch (val & KVM_REG_FEATURE_LEVEL_MASK) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; + break; + default: + return -EINVAL; + } + + /* + * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the + * other way around. + */ + if (get_kernel_wa_level(reg->id) < wa_level) + return -EINVAL; + + return 0; + case KVM_REG_ARM_STD_BMAP: + case KVM_REG_ARM_STD_HYP_BMAP: + case KVM_REG_ARM_VENDOR_HYP_BMAP: + return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val); + default: + return -ENOENT; + } + + return -EINVAL; +} diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 5400fc020164..f5651a05b6a8 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -258,8 +258,8 @@ static bool kvm_host_owns_hyp_mappings(void) return true; } -static int __create_hyp_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot) +int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { int err; @@ -457,23 +457,22 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, - enum kvm_pgtable_prot prot) + +/** + * hyp_alloc_private_va_range - Allocates a private VA range. + * @size: The size of the VA range to reserve. + * @haddr: The hypervisor virtual start address of the allocation. + * + * The private virtual address (VA) range is allocated below io_map_base + * and aligned based on the order of @size. + * + * Return: 0 on success or negative error code on failure. + */ +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr) { unsigned long base; int ret = 0; - if (!kvm_host_owns_hyp_mappings()) { - base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, - phys_addr, size, prot); - if (IS_ERR_OR_NULL((void *)base)) - return PTR_ERR((void *)base); - *haddr = base; - - return 0; - } - mutex_lock(&kvm_hyp_pgd_mutex); /* @@ -484,8 +483,10 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, * * The allocated size is always a multiple of PAGE_SIZE. */ - size = PAGE_ALIGN(size + offset_in_page(phys_addr)); - base = io_map_base - size; + base = io_map_base - PAGE_ALIGN(size); + + /* Align the allocation based on the order of its size */ + base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size)); /* * Verify that BIT(VA_BITS - 1) hasn't been flipped by @@ -495,19 +496,40 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, if ((base ^ io_map_base) & BIT(VA_BITS - 1)) ret = -ENOMEM; else - io_map_base = base; + *haddr = io_map_base = base; mutex_unlock(&kvm_hyp_pgd_mutex); + return ret; +} + +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, + unsigned long *haddr, + enum kvm_pgtable_prot prot) +{ + unsigned long addr; + int ret = 0; + + if (!kvm_host_owns_hyp_mappings()) { + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, + phys_addr, size, prot); + if (IS_ERR_VALUE(addr)) + return addr; + *haddr = addr; + + return 0; + } + + size = PAGE_ALIGN(size + offset_in_page(phys_addr)); + ret = hyp_alloc_private_va_range(size, &addr); if (ret) - goto out; + return ret; - ret = __create_hyp_mappings(base, size, phys_addr, prot); + ret = __create_hyp_mappings(addr, size, phys_addr, prot); if (ret) - goto out; + return ret; - *haddr = base + offset_in_page(phys_addr); -out: + *haddr = addr + offset_in_page(phys_addr); return ret; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 3dc990ac4f44..11c43bed5f97 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -774,8 +774,7 @@ void kvm_host_pmu_init(struct arm_pmu *pmu) { struct arm_pmu_entry *entry; - if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_PMUVER_IMP_DEF || - is_protected_kvm_enabled()) + if (pmu->pmuver == 0 || pmu->pmuver == ID_AA64DFR0_PMUVER_IMP_DEF) return; mutex_lock(&arm_pmus_lock); diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 03a6c1f4a09a..7887133d15f0 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,7 +5,8 @@ */ #include <linux/kvm_host.h> #include <linux/perf_event.h> -#include <asm/kvm_hyp.h> + +static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); /* * Given the perf event attributes and system type, determine @@ -25,21 +26,26 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) return (attr->exclude_host != attr->exclude_guest); } +struct kvm_pmu_events *kvm_get_pmu_events(void) +{ + return this_cpu_ptr(&kvm_pmu_events); +} + /* * Add events to track that we may want to switch at guest entry/exit * time. */ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { - struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr)) + if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) - ctx->pmu_events.events_host |= set; + pmu->events_host |= set; if (!attr->exclude_guest) - ctx->pmu_events.events_guest |= set; + pmu->events_guest |= set; } /* @@ -47,13 +53,13 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) */ void kvm_clr_pmu_events(u32 clr) { - struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !ctx) + if (!kvm_arm_support_pmu_v3() || !pmu) return; - ctx->pmu_events.events_host &= ~clr; - ctx->pmu_events.events_guest &= ~clr; + pmu->events_host &= ~clr; + pmu->events_guest &= ~clr; } #define PMEVTYPER_READ_CASE(idx) \ @@ -169,16 +175,16 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) */ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; + struct kvm_pmu_events *pmu; u32 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; preempt_disable(); - host = this_cpu_ptr_hyp_sym(kvm_host_data); - events_guest = host->pmu_events.events_guest; - events_host = host->pmu_events.events_host; + pmu = kvm_get_pmu_events(); + events_guest = pmu->events_guest; + events_host = pmu->events_host; kvm_vcpu_pmu_enable_el0(events_guest); kvm_vcpu_pmu_disable_el0(events_host); @@ -190,15 +196,15 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) */ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { - struct kvm_host_data *host; + struct kvm_pmu_events *pmu; u32 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; - host = this_cpu_ptr_hyp_sym(kvm_host_data); - events_guest = host->pmu_events.events_guest; - events_host = host->pmu_events.events_host; + pmu = kvm_get_pmu_events(); + events_guest = pmu->events_guest; + events_host = pmu->events_host; kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 708d80e8e60d..7fbc4c1b9df0 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -51,13 +51,6 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) return PSCI_RET_SUCCESS; } -static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) -{ - vcpu->arch.power_off = true; - kvm_make_request(KVM_REQ_SLEEP, vcpu); - kvm_vcpu_kick(vcpu); -} - static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, unsigned long affinity) { @@ -83,7 +76,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.power_off) { + if (!kvm_arm_vcpu_stopped(vcpu)) { if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) return PSCI_RET_ALREADY_ON; else @@ -107,12 +100,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); /* - * Make sure the reset request is observed if the change to - * power_off is observed. + * Make sure the reset request is observed if the RUNNABLE mp_state is + * observed. */ smp_wmb(); - vcpu->arch.power_off = false; + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; kvm_vcpu_wake_up(vcpu); return PSCI_RET_SUCCESS; @@ -150,7 +143,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) mpidr = kvm_vcpu_get_mpidr_aff(tmp); if ((mpidr & target_affinity_mask) == target_affinity) { matching_cpus++; - if (!tmp->arch.power_off) + if (!kvm_arm_vcpu_stopped(tmp)) return PSCI_0_2_AFFINITY_LEVEL_ON; } } @@ -176,7 +169,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) * re-initialized. */ kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.power_off = true; + tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); @@ -202,6 +195,15 @@ static void kvm_psci_system_reset2(struct kvm_vcpu *vcpu) KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2); } +static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + memset(&run->system_event, 0, sizeof(vcpu->run->system_event)); + run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND; + run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) { int i; @@ -245,7 +247,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) val = kvm_psci_vcpu_suspend(vcpu); break; case PSCI_0_2_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); val = PSCI_RET_SUCCESS; break; case PSCI_0_2_FN_CPU_ON: @@ -305,9 +307,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) { + unsigned long val = PSCI_RET_NOT_SUPPORTED; u32 psci_fn = smccc_get_function(vcpu); + struct kvm *kvm = vcpu->kvm; u32 arg; - unsigned long val; int ret = 1; switch(psci_fn) { @@ -320,6 +323,8 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) if (val) break; + val = PSCI_RET_NOT_SUPPORTED; + switch(arg) { case PSCI_0_2_FN_PSCI_VERSION: case PSCI_0_2_FN_CPU_SUSPEND: @@ -336,18 +341,32 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) case ARM_SMCCC_VERSION_FUNC_ID: val = 0; break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) + val = 0; + break; case PSCI_1_1_FN_SYSTEM_RESET2: case PSCI_1_1_FN64_SYSTEM_RESET2: - if (minor >= 1) { + if (minor >= 1) val = 0; - break; - } - fallthrough; - default: - val = PSCI_RET_NOT_SUPPORTED; break; } break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + /* + * Return directly to userspace without changing the vCPU's + * registers. Userspace depends on reading the SMCCC parameters + * to implement SYSTEM_SUSPEND. + */ + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) { + kvm_psci_system_suspend(vcpu); + return 0; + } + break; case PSCI_1_1_FN_SYSTEM_RESET2: kvm_psci_narrow_to_32bit(vcpu); fallthrough; @@ -365,7 +384,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) val = PSCI_RET_INVALID_PARAMS; break; } - fallthrough; + break; default: return kvm_psci_0_2_call(vcpu); } @@ -382,7 +401,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) switch (psci_fn) { case KVM_PSCI_FN_CPU_OFF: - kvm_psci_vcpu_off(vcpu); + kvm_arm_vcpu_power_off(vcpu); val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: @@ -437,186 +456,3 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) return -EINVAL; } } - -int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) -{ - return 4; /* PSCI version and three workaround registers */ -} - -int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) - return -EFAULT; - - if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, uindices++)) - return -EFAULT; - - return 0; -} - -#define KVM_REG_FEATURE_LEVEL_WIDTH 4 -#define KVM_REG_FEATURE_LEVEL_MASK (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1) - -/* - * Convert the workaround level into an easy-to-compare number, where higher - * values mean better protection. - */ -static int get_kernel_wa_level(u64 regid) -{ - switch (regid) { - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - switch (arm64_get_spectre_v2_state()) { - case SPECTRE_VULNERABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case SPECTRE_MITIGATED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; - case SPECTRE_UNAFFECTED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; - } - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - switch (arm64_get_spectre_v4_state()) { - case SPECTRE_MITIGATED: - /* - * As for the hypercall discovery, we pretend we - * don't have any FW mitigation if SSBS is there at - * all times. - */ - if (cpus_have_final_cap(ARM64_SSBS)) - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - fallthrough; - case SPECTRE_UNAFFECTED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - case SPECTRE_VULNERABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - } - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - switch (arm64_get_spectre_bhb_state()) { - case SPECTRE_VULNERABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; - case SPECTRE_MITIGATED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; - case SPECTRE_UNAFFECTED: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; - } - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; - } - - return -EINVAL; -} - -int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - switch (reg->id) { - case KVM_REG_ARM_PSCI_VERSION: - val = kvm_psci_version(vcpu); - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; - break; - default: - return -ENOENT; - } - - if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - return 0; -} - -int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - int wa_level; - - if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id))) - return -EFAULT; - - switch (reg->id) { - case KVM_REG_ARM_PSCI_VERSION: - { - bool wants_02; - - wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features); - - switch (val) { - case KVM_ARM_PSCI_0_1: - if (wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - case KVM_ARM_PSCI_0_2: - case KVM_ARM_PSCI_1_0: - case KVM_ARM_PSCI_1_1: - if (!wants_02) - return -EINVAL; - vcpu->kvm->arch.psci_version = val; - return 0; - } - break; - } - - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - if (val & ~KVM_REG_FEATURE_LEVEL_MASK) - return -EINVAL; - - if (get_kernel_wa_level(reg->id) < val) - return -EINVAL; - - return 0; - - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - if (val & ~(KVM_REG_FEATURE_LEVEL_MASK | - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) - return -EINVAL; - - /* The enabled bit must not be set unless the level is AVAIL. */ - if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) && - (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL) - return -EINVAL; - - /* - * Map all the possible incoming states to the only two we - * really want to deal with. - */ - switch (val & KVM_REG_FEATURE_LEVEL_MASK) { - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: - wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - break; - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: - case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: - wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - break; - default: - return -EINVAL; - } - - /* - * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the - * other way around. - */ - if (get_kernel_wa_level(reg->id) < wa_level) - return -EINVAL; - - return 0; - default: - return -ENOENT; - } - - return -EINVAL; -} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 18b403b58b53..c06c0477fab5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1145,6 +1145,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_GPA3)); + if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_WFXT); break; case SYS_ID_AA64DFR0_EL1: /* Limit debug to ARMv8.0 */ @@ -2020,20 +2022,22 @@ static const struct sys_reg_desc cp14_64_regs[] = { { Op1( 0), CRm( 2), .access = trap_raz_wi }, }; +#define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2) \ + AA32(_map), \ + Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2), \ + .visibility = pmu_visibility + /* Macro to expand the PMEVCNTRn register */ #define PMU_PMEVCNTR(n) \ - /* PMEVCNTRn */ \ - { Op1(0), CRn(0b1110), \ - CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ - access_pmu_evcntr } + { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ + (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ + .access = access_pmu_evcntr } /* Macro to expand the PMEVTYPERn register */ #define PMU_PMEVTYPER(n) \ - /* PMEVTYPERn */ \ - { Op1(0), CRn(0b1110), \ - CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ - access_pmu_evtyper } - + { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ + (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ + .access = access_pmu_evtyper } /* * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding, * depending on the way they are accessed (as a 32bit or a 64bit @@ -2073,25 +2077,25 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, /* PMU */ - { Op1( 0), CRn( 9), CRm(12), Op2( 0), access_pmcr }, - { Op1( 0), CRn( 9), CRm(12), Op2( 1), access_pmcnten }, - { Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmcnten }, - { Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs }, - { Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc }, - { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr }, - { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid }, - { AA32(LO), Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid }, - { Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_evcntr }, - { Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_evtyper }, - { Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_evcntr }, - { Op1( 0), CRn( 9), CRm(14), Op2( 0), access_pmuserenr }, - { Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pminten }, - { Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten }, - { Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs }, - { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 4), access_pmceid }, - { AA32(HI), Op1( 0), CRn( 9), CRm(14), Op2( 5), access_pmceid }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr }, + { CP15_PMU_SYS_REG(LO, 0, 9, 12, 6), .access = access_pmceid }, + { CP15_PMU_SYS_REG(LO, 0, 9, 12, 7), .access = access_pmceid }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs }, + { CP15_PMU_SYS_REG(HI, 0, 9, 14, 4), .access = access_pmceid }, + { CP15_PMU_SYS_REG(HI, 0, 9, 14, 5), .access = access_pmceid }, /* PMMIR */ - { Op1( 0), CRn( 9), CRm(14), Op2( 6), trap_raz_wi }, + { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi }, /* PRRR/MAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, @@ -2176,7 +2180,7 @@ static const struct sys_reg_desc cp15_regs[] = { PMU_PMEVTYPER(29), PMU_PMEVTYPER(30), /* PMCCFILTR */ - { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper }, + { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper }, { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, @@ -2185,7 +2189,7 @@ static const struct sys_reg_desc cp15_regs[] = { static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, - { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr }, + { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ @@ -2193,25 +2197,24 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; -static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, - bool is_32) +static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, + bool is_32) { unsigned int i; for (i = 0; i < n; i++) { if (!is_32 && table[i].reg && !table[i].reset) { - kvm_err("sys_reg table %p entry %d has lacks reset\n", - table, i); - return 1; + kvm_err("sys_reg table %pS entry %d lacks reset\n", &table[i], i); + return false; } if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1); - return 1; + kvm_err("sys_reg table %pS entry %d out of order\n", &table[i - 1], i - 1); + return false; } } - return 0; + return true; } int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) @@ -2252,27 +2255,27 @@ static void perform_access(struct kvm_vcpu *vcpu, * @table: array of trap descriptors * @num: size of the trap descriptor array * - * Return 0 if the access has been handled, and -1 if not. + * Return true if the access has been handled, false if not. */ -static int emulate_cp(struct kvm_vcpu *vcpu, - struct sys_reg_params *params, - const struct sys_reg_desc *table, - size_t num) +static bool emulate_cp(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *table, + size_t num) { const struct sys_reg_desc *r; if (!table) - return -1; /* Not handled */ + return false; /* Not handled */ r = find_reg(params, table, num); if (r) { perform_access(vcpu, params, r); - return 0; + return true; } /* Not handled */ - return -1; + return false; } static void unhandled_cp_access(struct kvm_vcpu *vcpu, @@ -2336,7 +2339,7 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, * potential register operation in the case of a read and return * with success. */ - if (!emulate_cp(vcpu, ¶ms, global, nr_global)) { + if (emulate_cp(vcpu, ¶ms, global, nr_global)) { /* Split up the value between registers for the read side */ if (!params.is_write) { vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); @@ -2350,34 +2353,144 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, return 1; } +static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params); + +/* + * The CP10 ID registers are architecturally mapped to AArch64 feature + * registers. Abuse that fact so we can rely on the AArch64 handler for accesses + * from AArch32. + */ +static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params) +{ + u8 reg_id = (esr >> 10) & 0xf; + bool valid; + + params->is_write = ((esr & 1) == 0); + params->Op0 = 3; + params->Op1 = 0; + params->CRn = 0; + params->CRm = 3; + + /* CP10 ID registers are read-only */ + valid = !params->is_write; + + switch (reg_id) { + /* MVFR0 */ + case 0b0111: + params->Op2 = 0; + break; + /* MVFR1 */ + case 0b0110: + params->Op2 = 1; + break; + /* MVFR2 */ + case 0b0101: + params->Op2 = 2; + break; + default: + valid = false; + } + + if (valid) + return true; + + kvm_pr_unimpl("Unhandled cp10 register %s: %u\n", + params->is_write ? "write" : "read", reg_id); + return false; +} + +/** + * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and + * VFP Register' from AArch32. + * @vcpu: The vCPU pointer + * + * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers. + * Work out the correct AArch64 system register encoding and reroute to the + * AArch64 system register emulation. + */ +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu) +{ + int Rt = kvm_vcpu_sys_get_rt(vcpu); + u64 esr = kvm_vcpu_get_esr(vcpu); + struct sys_reg_params params; + + /* UNDEF on any unhandled register access */ + if (!kvm_esr_cp10_id_to_sys64(esr, ¶ms)) { + kvm_inject_undefined(vcpu); + return 1; + } + + if (emulate_sys_reg(vcpu, ¶ms)) + vcpu_set_reg(vcpu, Rt, params.regval); + + return 1; +} + +/** + * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where + * CRn=0, which corresponds to the AArch32 feature + * registers. + * @vcpu: the vCPU pointer + * @params: the system register access parameters. + * + * Our cp15 system register tables do not enumerate the AArch32 feature + * registers. Conveniently, our AArch64 table does, and the AArch32 system + * register encoding can be trivially remapped into the AArch64 for the feature + * registers: Append op0=3, leaving op1, CRn, CRm, and op2 the same. + * + * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit + * System registers with (coproc=0b1111, CRn==c0)", read accesses from this + * range are either UNKNOWN or RES0. Rerouting remains architectural as we + * treat undefined registers in this range as RAZ. + */ +static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, + struct sys_reg_params *params) +{ + int Rt = kvm_vcpu_sys_get_rt(vcpu); + + /* Treat impossible writes to RO registers as UNDEFINED */ + if (params->is_write) { + unhandled_cp_access(vcpu, params); + return 1; + } + + params->Op0 = 3; + + /* + * All registers where CRm > 3 are known to be UNKNOWN/RAZ from AArch32. + * Avoid conflicting with future expansion of AArch64 feature registers + * and simply treat them as RAZ here. + */ + if (params->CRm > 3) + params->regval = 0; + else if (!emulate_sys_reg(vcpu, params)) + return 1; + + vcpu_set_reg(vcpu, Rt, params->regval); + return 1; +} + /** * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer * @run: The kvm_run struct */ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, const struct sys_reg_desc *global, size_t nr_global) { - struct sys_reg_params params; - u64 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); - params.CRm = (esr >> 1) & 0xf; - params.regval = vcpu_get_reg(vcpu, Rt); - params.is_write = ((esr & 1) == 0); - params.CRn = (esr >> 10) & 0xf; - params.Op0 = 0; - params.Op1 = (esr >> 14) & 0x7; - params.Op2 = (esr >> 17) & 0x7; + params->regval = vcpu_get_reg(vcpu, Rt); - if (!emulate_cp(vcpu, ¶ms, global, nr_global)) { - if (!params.is_write) - vcpu_set_reg(vcpu, Rt, params.regval); + if (emulate_cp(vcpu, params, global, nr_global)) { + if (!params->is_write) + vcpu_set_reg(vcpu, Rt, params->regval); return 1; } - unhandled_cp_access(vcpu, ¶ms); + unhandled_cp_access(vcpu, params); return 1; } @@ -2388,7 +2501,20 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) { - return kvm_handle_cp_32(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs)); + struct sys_reg_params params; + + params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); + + /* + * Certain AArch32 ID registers are handled by rerouting to the AArch64 + * system register table. Registers in the ID range where CRm=0 are + * excluded from this scheme as they do not trivially map into AArch64 + * system register encodings. + */ + if (params.Op1 == 0 && params.CRn == 0 && params.CRm) + return kvm_emulate_cp15_id_reg(vcpu, ¶ms); + + return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); } int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) @@ -2398,7 +2524,11 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) { - return kvm_handle_cp_32(vcpu, cp14_regs, ARRAY_SIZE(cp14_regs)); + struct sys_reg_params params; + + params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); + + return kvm_handle_cp_32(vcpu, ¶ms, cp14_regs, ARRAY_SIZE(cp14_regs)); } static bool is_imp_def_sys_reg(struct sys_reg_params *params) @@ -2407,7 +2537,14 @@ static bool is_imp_def_sys_reg(struct sys_reg_params *params) return params->Op0 == 3 && (params->CRn & 0b1011) == 0b1011; } -static int emulate_sys_reg(struct kvm_vcpu *vcpu, +/** + * emulate_sys_reg - Emulate a guest access to an AArch64 system register + * @vcpu: The VCPU pointer + * @params: Decoded system register parameters + * + * Return: true if the system register access was successful, false otherwise. + */ +static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { const struct sys_reg_desc *r; @@ -2416,7 +2553,10 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, if (likely(r)) { perform_access(vcpu, params, r); - } else if (is_imp_def_sys_reg(params)) { + return true; + } + + if (is_imp_def_sys_reg(params)) { kvm_inject_undefined(vcpu); } else { print_sys_reg_msg(params, @@ -2424,7 +2564,7 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); } - return 1; + return false; } /** @@ -2452,18 +2592,18 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); - int ret; trace_kvm_handle_sys_reg(esr); params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - ret = emulate_sys_reg(vcpu, ¶ms); + if (!emulate_sys_reg(vcpu, ¶ms)) + return 1; if (!params.is_write) vcpu_set_reg(vcpu, Rt, params.regval); - return ret; + return 1; } /****************************************************************************** @@ -2866,18 +3006,22 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -void kvm_sys_reg_table_init(void) +int kvm_sys_reg_table_init(void) { + bool valid = true; unsigned int i; struct sys_reg_desc clidr; /* Make sure tables are unique and in order. */ - BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false)); - BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true)); - BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true)); - BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true)); - BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true)); - BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false)); + valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); + valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true); + valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); + valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); + valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); + valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); + + if (!valid) + return -EINVAL; /* We abuse the reset function to overwrite the table itself. */ for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) @@ -2900,4 +3044,6 @@ void kvm_sys_reg_table_init(void) break; /* Clear all higher bits. */ cache_levels &= (1 << (i*3))-1; + + return 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index cc0cc95a0280..aee8ea054f0d 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -35,12 +35,19 @@ struct sys_reg_params { .Op2 = ((esr) >> 17) & 0x7, \ .is_write = !((esr) & 1) }) +#define esr_cp1x_32_to_params(esr) \ + ((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; enum { - AA32_ZEROHIGH, + AA32_DIRECT, AA32_LO, AA32_HI, } aarch32_map; diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index fc00304fe7d8..f6d4f4052555 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -98,11 +98,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) ret = 0; if (type == KVM_DEV_TYPE_ARM_VGIC_V2) - kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; + kvm->max_vcpus = VGIC_V2_MAX_CPUS; else - kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; + kvm->max_vcpus = VGIC_V3_MAX_CPUS; - if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { + if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) { ret = -E2BIG; goto out_unlock; } @@ -319,7 +319,12 @@ int vgic_init(struct kvm *kvm) vgic_debug_init(kvm); - dist->implementation_rev = 2; + /* + * If userspace didn't set the GIC implementation revision, + * default to the latest and greatest. You know want it. + */ + if (!dist->implementation_rev) + dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST; dist->initialized = true; out: diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 2e13402be3bd..9d3299a70242 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -683,7 +683,7 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, if (!vcpu) return E_ITS_INT_UNMAPPED_INTERRUPT; - if (!vcpu->arch.vgic_cpu.lpis_enabled) + if (!vgic_lpis_enabled(vcpu)) return -EBUSY; vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); @@ -894,6 +894,18 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, return update_affinity(ite->irq, vcpu); } +static bool __is_visible_gfn_locked(struct vgic_its *its, gpa_t gpa) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int idx; + bool ret; + + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; +} + /* * Check whether an ID can be stored into the corresponding guest table. * For a direct table this is pretty easy, but gets a bit nasty for @@ -908,9 +920,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, u64 indirect_ptr, type = GITS_BASER_TYPE(baser); phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); - int index, idx; - gfn_t gfn; - bool ret; + int index; switch (type) { case GITS_BASER_TYPE_DEVICE: @@ -933,12 +943,11 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, return false; addr = base + id * esz; - gfn = addr >> PAGE_SHIFT; if (eaddr) *eaddr = addr; - goto out; + return __is_visible_gfn_locked(its, addr); } /* calculate and check the index into the 1st level */ @@ -964,27 +973,42 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, /* Find the address of the actual entry */ index = id % (SZ_64K / esz); indirect_ptr += index * esz; - gfn = indirect_ptr >> PAGE_SHIFT; if (eaddr) *eaddr = indirect_ptr; -out: - idx = srcu_read_lock(&its->dev->kvm->srcu); - ret = kvm_is_visible_gfn(its->dev->kvm, gfn); - srcu_read_unlock(&its->dev->kvm->srcu, idx); - return ret; + return __is_visible_gfn_locked(its, indirect_ptr); } +/* + * Check whether an event ID can be stored in the corresponding Interrupt + * Translation Table, which starts at device->itt_addr. + */ +static bool vgic_its_check_event_id(struct vgic_its *its, struct its_device *device, + u32 event_id) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ite_esz = abi->ite_esz; + gpa_t gpa; + + /* max table size is: BIT_ULL(device->num_eventid_bits) * ite_esz */ + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return false; + + gpa = device->itt_addr + event_id * ite_esz; + return __is_visible_gfn_locked(its, gpa); +} + +/* + * Add a new collection into the ITS collection table. + * Returns 0 on success, and a negative error value for generic errors. + */ static int vgic_its_alloc_collection(struct vgic_its *its, struct its_collection **colp, u32 coll_id) { struct its_collection *collection; - if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) - return E_ITS_MAPC_COLLECTION_OOR; - collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT); if (!collection) return -ENOMEM; @@ -1061,7 +1085,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, if (!device) return E_ITS_MAPTI_UNMAPPED_DEVICE; - if (event_id >= BIT_ULL(device->num_eventid_bits)) + if (!vgic_its_check_event_id(its, device, event_id)) return E_ITS_MAPTI_ID_OOR; if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) @@ -1078,7 +1102,12 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, collection = find_collection(its, coll_id); if (!collection) { - int ret = vgic_its_alloc_collection(its, &collection, coll_id); + int ret; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + + ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; new_coll = collection; @@ -1233,6 +1262,10 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, if (!collection) { int ret; + if (!vgic_its_check_id(its, its->baser_coll_table, + coll_id, NULL)) + return E_ITS_MAPC_COLLECTION_OOR; + ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) @@ -1272,6 +1305,11 @@ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, return 0; } +int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq) +{ + return update_lpi_config(kvm, irq, NULL, true); +} + /* * The INV command syncs the configuration bits from the memory table. * Must be called with the its_lock mutex held. @@ -1288,7 +1326,41 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, if (!ite) return E_ITS_INV_UNMAPPED_INTERRUPT; - return update_lpi_config(kvm, ite->irq, NULL, true); + return vgic_its_inv_lpi(kvm, ite->irq); +} + +/** + * vgic_its_invall - invalidate all LPIs targetting a given vcpu + * @vcpu: the vcpu for which the RD is targetted by an invalidation + * + * Contrary to the INVALL command, this targets a RD instead of a + * collection, and we don't need to hold the its_lock, since no ITS is + * involved here. + */ +int vgic_its_invall(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int irq_count, i = 0; + u32 *intids; + + irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; + update_lpi_config(kvm, irq, vcpu, false); + vgic_put_irq(kvm, irq); + } + + kfree(intids); + + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) + its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + + return 0; } /* @@ -1305,32 +1377,13 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, u32 coll_id = its_cmd_get_collection(its_cmd); struct its_collection *collection; struct kvm_vcpu *vcpu; - struct vgic_irq *irq; - u32 *intids; - int irq_count, i; collection = find_collection(its, coll_id); if (!its_is_collection_mapped(collection)) return E_ITS_INVALL_UNMAPPED_COLLECTION; vcpu = kvm_get_vcpu(kvm, collection->target_addr); - - irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); - if (irq_count < 0) - return irq_count; - - for (i = 0; i < irq_count; i++) { - irq = vgic_get_irq(kvm, NULL, intids[i]); - if (!irq) - continue; - update_lpi_config(kvm, irq, vcpu, false); - vgic_put_irq(kvm, irq); - } - - kfree(intids); - - if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) - its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + vgic_its_invall(vcpu); return 0; } @@ -2175,6 +2228,9 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, if (!collection) return -EINVAL; + if (!vgic_its_check_event_id(its, dev, event_id)) + return -EINVAL; + ite = vgic_its_alloc_ite(dev, collection, event_id); if (IS_ERR(ite)) return PTR_ERR(ite); @@ -2183,8 +2239,10 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id, vcpu = kvm_get_vcpu(kvm, collection->target_addr); irq = vgic_add_lpi(kvm, lpi_id, vcpu); - if (IS_ERR(irq)) + if (IS_ERR(irq)) { + its_free_ite(kvm, ite); return PTR_ERR(irq); + } ite->irq = irq; return offset; @@ -2296,6 +2354,7 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, void *ptr, void *opaque) { struct its_device *dev; + u64 baser = its->baser_device_table; gpa_t itt_addr; u8 num_eventid_bits; u64 entry = *(u64 *)ptr; @@ -2316,6 +2375,9 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id, /* dte entry is valid */ offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT; + if (!vgic_its_check_id(its, baser, id, NULL)) + return -EINVAL; + dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -2445,6 +2507,9 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) if (ret > 0) ret = 0; + if (ret < 0) + vgic_its_free_device_list(its->dev->kvm, its); + return ret; } @@ -2461,6 +2526,11 @@ static int vgic_its_save_cte(struct vgic_its *its, return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); } +/* + * Restore a collection entry into the ITS collection table. + * Return +1 on success, 0 if the entry was invalid (which should be + * interpreted as end-of-table), and a negative error value for generic errors. + */ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) { struct its_collection *collection; @@ -2487,6 +2557,10 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) collection = find_collection(its, coll_id); if (collection) return -EEXIST; + + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) + return -EINVAL; + ret = vgic_its_alloc_collection(its, &collection, coll_id); if (ret) return ret; @@ -2566,6 +2640,9 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) if (ret > 0) return 0; + if (ret < 0) + vgic_its_free_collection_list(its->dev->kvm, its); + return ret; } @@ -2597,7 +2674,10 @@ static int vgic_its_restore_tables_v0(struct vgic_its *its) if (ret) return ret; - return vgic_its_restore_device_tables(its); + ret = vgic_its_restore_device_tables(its); + if (ret) + vgic_its_free_collection_list(its->dev->kvm, its); + return ret; } static int vgic_its_commit_v0(struct vgic_its *its) diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index 12e4c223e6b8..77a67e9d3d14 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -73,9 +73,13 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u32 reg; + switch (addr & 0x0c) { case GIC_DIST_IIDR: - if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) + reg = vgic_mmio_read_v2_misc(vcpu, addr, len); + if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) return -EINVAL; /* @@ -87,8 +91,16 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, * migration from old kernels to new kernels with legacy * userspace. */ - vcpu->kvm->arch.vgic.v2_groups_user_writable = true; - return 0; + reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); + switch (reg) { + case KVM_VGIC_IMP_REV_2: + case KVM_VGIC_IMP_REV_3: + vcpu->kvm->arch.vgic.v2_groups_user_writable = true; + dist->implementation_rev = reg; + return 0; + default: + return -EINVAL; + } } vgic_mmio_write_v2_misc(vcpu, addr, len, val); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 58e40b4874f8..f7aa7bcd6fb8 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -155,13 +155,27 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u32 reg; switch (addr & 0x0c) { case GICD_TYPER2: - case GICD_IIDR: if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) return -EINVAL; return 0; + case GICD_IIDR: + reg = vgic_mmio_read_v3_misc(vcpu, addr, len); + if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK) + return -EINVAL; + + reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg); + switch (reg) { + case KVM_VGIC_IMP_REV_2: + case KVM_VGIC_IMP_REV_3: + dist->implementation_rev = reg; + return 0; + default: + return -EINVAL; + } case GICD_CTLR: /* Not a GICv4.1? No HW SGIs */ if (!kvm_vgic_global_state.has_gicv4_1) @@ -221,34 +235,58 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, vgic_put_irq(vcpu->kvm, irq); } +bool vgic_lpis_enabled(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return atomic_read(&vgic_cpu->ctlr) == GICR_CTLR_ENABLE_LPIS; +} + static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + unsigned long val; - return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0; -} + val = atomic_read(&vgic_cpu->ctlr); + if (vgic_get_implementation_rev(vcpu) >= KVM_VGIC_IMP_REV_3) + val |= GICR_CTLR_IR | GICR_CTLR_CES; + return val; +} static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - bool was_enabled = vgic_cpu->lpis_enabled; + u32 ctlr; if (!vgic_has_its(vcpu->kvm)) return; - vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; + if (!(val & GICR_CTLR_ENABLE_LPIS)) { + /* + * Don't disable if RWP is set, as there already an + * ongoing disable. Funky guest... + */ + ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, + GICR_CTLR_ENABLE_LPIS, + GICR_CTLR_RWP); + if (ctlr != GICR_CTLR_ENABLE_LPIS) + return; - if (was_enabled && !vgic_cpu->lpis_enabled) { vgic_flush_pending_lpis(vcpu); vgic_its_invalidate_cache(vcpu->kvm); - } + atomic_set_release(&vgic_cpu->ctlr, 0); + } else { + ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0, + GICR_CTLR_ENABLE_LPIS); + if (ctlr != 0) + return; - if (!was_enabled && vgic_cpu->lpis_enabled) vgic_enable_lpis(vcpu); + } } static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu) @@ -478,11 +516,10 @@ static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u64 old_propbaser, propbaser; /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) + if (vgic_lpis_enabled(vcpu)) return; do { @@ -513,7 +550,7 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, u64 old_pendbaser, pendbaser; /* Storing a value with LPIs already enabled is undefined */ - if (vgic_cpu->lpis_enabled) + if (vgic_lpis_enabled(vcpu)) return; do { @@ -525,6 +562,63 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, pendbaser) != old_pendbaser); } +static unsigned long vgic_mmio_read_sync(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return !!atomic_read(&vcpu->arch.vgic_cpu.syncr_busy); +} + +static void vgic_set_rdist_busy(struct kvm_vcpu *vcpu, bool busy) +{ + if (busy) { + atomic_inc(&vcpu->arch.vgic_cpu.syncr_busy); + smp_mb__after_atomic(); + } else { + smp_mb__before_atomic(); + atomic_dec(&vcpu->arch.vgic_cpu.syncr_busy); + } +} + +static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_irq *irq; + + /* + * If the guest wrote only to the upper 32bit part of the + * register, drop the write on the floor, as it is only for + * vPEs (which we don't support for obvious reasons). + * + * Also discard the access if LPIs are not enabled. + */ + if ((addr & 4) || !vgic_lpis_enabled(vcpu)) + return; + + vgic_set_rdist_busy(vcpu, true); + + irq = vgic_get_irq(vcpu->kvm, NULL, lower_32_bits(val)); + if (irq) { + vgic_its_inv_lpi(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); + } + + vgic_set_rdist_busy(vcpu, false); +} + +static void vgic_mmio_write_invall(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* See vgic_mmio_write_invlpi() for the early return rationale */ + if ((addr & 4) || !vgic_lpis_enabled(vcpu)) + return; + + vgic_set_rdist_busy(vcpu, true); + vgic_its_invall(vcpu); + vgic_set_rdist_busy(vcpu, false); +} + /* * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the * redistributors, while SPIs are covered by registers in the distributor @@ -630,6 +724,15 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_INVLPIR, + vgic_mmio_read_raz, vgic_mmio_write_invlpi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_INVALLR, + vgic_mmio_read_raz, vgic_mmio_write_invall, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_SYNCR, + vgic_mmio_read_sync, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, VGIC_ACCESS_32bit), diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index b549af8b1dc2..826ff6f2a4e7 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -612,6 +612,10 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), {}, }; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 3fd6c86a7ef3..4c6bdd321faa 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -98,6 +98,11 @@ #define DEBUG_SPINLOCK_BUG_ON(p) #endif +static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.vgic.implementation_rev; +} + /* Requires the irq_lock to be held by the caller. */ static inline bool irq_is_pending(struct vgic_irq *irq) { @@ -308,6 +313,7 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); } +bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); @@ -317,6 +323,10 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm); void vgic_lpi_translation_cache_destroy(struct kvm *kvm); void vgic_its_invalidate_cache(struct kvm *kvm); +/* GICv4.1 MMIO interface */ +int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); +int vgic_its_invall(struct kvm_vcpu *vcpu); + bool vgic_supports_direct_msis(struct kvm *kvm); int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c index 1688af0a4c97..5b7890139bc2 100644 --- a/arch/arm64/lib/delay.c +++ b/arch/arm64/lib/delay.c @@ -27,7 +27,17 @@ void __delay(unsigned long cycles) { cycles_t start = get_cycles(); - if (arch_timer_evtstrm_available()) { + if (cpus_have_const_cap(ARM64_HAS_WFXT)) { + u64 end = start + cycles; + + /* + * Start with WFIT. If an interrupt makes us resume + * early, use a WFET loop to complete the delay. + */ + wfit(end); + while ((get_cycles() - start) < cycles) + wfet(end); + } else if (arch_timer_evtstrm_available()) { const cycles_t timer_evt_period = USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US); diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index e52b289a27c2..507b20373953 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -38,6 +38,7 @@ HAS_STAGE2_FWB HAS_SYSREG_GIC_CPUIF HAS_TLB_RANGE HAS_VIRT_HOST_EXTN +HAS_WFXT HW_DBM KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index e935f27b10fd..cc40521e438b 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -117,6 +117,7 @@ #define HGATP_MODE_SV32X4 _AC(1, UL) #define HGATP_MODE_SV39X4 _AC(8, UL) #define HGATP_MODE_SV48X4 _AC(9, UL) +#define HGATP_MODE_SV57X4 _AC(10, UL) #define HGATP32_MODE_SHIFT 31 #define HGATP32_VMID_SHIFT 22 diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index cd4bbcecb0fb..319c8aeb42af 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -12,12 +12,12 @@ #include <linux/types.h> #include <linux/kvm.h> #include <linux/kvm_types.h> +#include <linux/spinlock.h> #include <asm/csr.h> #include <asm/kvm_vcpu_fp.h> #include <asm/kvm_vcpu_timer.h> -#define KVM_MAX_VCPUS \ - ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1) +#define KVM_MAX_VCPUS 1024 #define KVM_HALT_POLL_NS_DEFAULT 500000 @@ -27,6 +27,31 @@ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(1) #define KVM_REQ_UPDATE_HGATP KVM_ARCH_REQ(2) +#define KVM_REQ_FENCE_I \ + KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HFENCE_GVMA_VMID_ALL KVM_REQ_TLB_FLUSH +#define KVM_REQ_HFENCE_VVMA_ALL \ + KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HFENCE \ + KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) + +enum kvm_riscv_hfence_type { + KVM_RISCV_HFENCE_UNKNOWN = 0, + KVM_RISCV_HFENCE_GVMA_VMID_GPA, + KVM_RISCV_HFENCE_VVMA_ASID_GVA, + KVM_RISCV_HFENCE_VVMA_ASID_ALL, + KVM_RISCV_HFENCE_VVMA_GVA, +}; + +struct kvm_riscv_hfence { + enum kvm_riscv_hfence_type type; + unsigned long asid; + unsigned long order; + gpa_t addr; + gpa_t size; +}; + +#define KVM_RISCV_VCPU_MAX_HFENCE 64 struct kvm_vm_stat { struct kvm_vm_stat_generic generic; @@ -54,10 +79,10 @@ struct kvm_vmid { }; struct kvm_arch { - /* stage2 vmid */ + /* G-stage vmid */ struct kvm_vmid vmid; - /* stage2 page table */ + /* G-stage page table */ pgd_t *pgd; phys_addr_t pgd_phys; @@ -141,6 +166,9 @@ struct kvm_vcpu_arch { /* VCPU ran at least once */ bool ran_atleast_once; + /* Last Host CPU on which Guest VCPU exited */ + int last_exit_cpu; + /* ISA feature bits (similar to MISA) */ unsigned long isa; @@ -179,6 +207,12 @@ struct kvm_vcpu_arch { /* VCPU Timer */ struct kvm_vcpu_timer timer; + /* HFENCE request queue */ + spinlock_t hfence_lock; + unsigned long hfence_head; + unsigned long hfence_tail; + struct kvm_riscv_hfence hfence_queue[KVM_RISCV_VCPU_MAX_HFENCE]; + /* MMIO instruction details */ struct kvm_mmio_decode mmio_decode; @@ -201,27 +235,71 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} #define KVM_ARCH_WANT_MMU_NOTIFIER -void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa_divby_4, - unsigned long vmid); -void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); -void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa_divby_4); -void __kvm_riscv_hfence_gvma_all(void); - -int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, +#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 + +void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, + gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid); +void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_local_hfence_gvma_all(void); +void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, + unsigned long asid, + unsigned long gva, + unsigned long gvsz, + unsigned long order); +void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, + unsigned long asid); +void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, + unsigned long gva, unsigned long gvsz, + unsigned long order); +void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); + +void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu); + +void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu); + +void kvm_riscv_fence_i(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); +void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); +void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order, unsigned long asid); +void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long asid); +void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order); +void kvm_riscv_hfence_vvma_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); + +int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write); -int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm); -void kvm_riscv_stage2_free_pgd(struct kvm *kvm); -void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu); -void kvm_riscv_stage2_mode_detect(void); -unsigned long kvm_riscv_stage2_mode(void); -int kvm_riscv_stage2_gpa_bits(void); - -void kvm_riscv_stage2_vmid_detect(void); -unsigned long kvm_riscv_stage2_vmid_bits(void); -int kvm_riscv_stage2_vmid_init(struct kvm *kvm); -bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid); -void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu); +int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); +void kvm_riscv_gstage_free_pgd(struct kvm *kvm); +void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); +void kvm_riscv_gstage_mode_detect(void); +unsigned long kvm_riscv_gstage_mode(void); +int kvm_riscv_gstage_gpa_bits(void); + +void kvm_riscv_gstage_vmid_detect(void); +unsigned long kvm_riscv_gstage_vmid_bits(void); +int kvm_riscv_gstage_vmid_init(struct kvm *kvm); +bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); +void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); void __kvm_riscv_unpriv_trap(void); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index f808ad1ce500..6119368ba6d5 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -82,6 +82,23 @@ struct kvm_riscv_timer { __u64 state; }; +/* + * ISA extension IDs specific to KVM. This is not the same as the host ISA + * extension IDs as that is internal to the host and should not be exposed + * to the guest. This should always be contiguous to keep the mapping simple + * in KVM implementation. + */ +enum KVM_RISCV_ISA_EXT_ID { + KVM_RISCV_ISA_EXT_A = 0, + KVM_RISCV_ISA_EXT_C, + KVM_RISCV_ISA_EXT_D, + KVM_RISCV_ISA_EXT_F, + KVM_RISCV_ISA_EXT_H, + KVM_RISCV_ISA_EXT_I, + KVM_RISCV_ISA_EXT_M, + KVM_RISCV_ISA_EXT_MAX, +}; + /* Possible states for kvm_riscv_timer */ #define KVM_RISCV_TIMER_STATE_OFF 0 #define KVM_RISCV_TIMER_STATE_ON 1 @@ -123,6 +140,9 @@ struct kvm_riscv_timer { #define KVM_REG_RISCV_FP_D_REG(name) \ (offsetof(struct __riscv_d_ext_state, name) / sizeof(__u64)) +/* ISA Extension registers are mapped as type 7 */ +#define KVM_REG_RISCV_ISA_EXT (0x07 << KVM_REG_RISCV_TYPE_SHIFT) + #endif #endif /* __LINUX_KVM_RISCV_H */ diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 2e5ca43c8c49..1549205fe5fe 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -89,13 +89,13 @@ int kvm_arch_init(void *opaque) return -ENODEV; } - kvm_riscv_stage2_mode_detect(); + kvm_riscv_gstage_mode_detect(); - kvm_riscv_stage2_vmid_detect(); + kvm_riscv_gstage_vmid_detect(); kvm_info("hypervisor extension available\n"); - switch (kvm_riscv_stage2_mode()) { + switch (kvm_riscv_gstage_mode()) { case HGATP_MODE_SV32X4: str = "Sv32x4"; break; @@ -105,12 +105,15 @@ int kvm_arch_init(void *opaque) case HGATP_MODE_SV48X4: str = "Sv48x4"; break; + case HGATP_MODE_SV57X4: + str = "Sv57x4"; + break; default: return -ENODEV; } kvm_info("using %s G-stage page table format\n", str); - kvm_info("VMID %ld bits available\n", kvm_riscv_stage2_vmid_bits()); + kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); return 0; } diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index f80a34fbf102..1c00695ebee7 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -18,53 +18,52 @@ #include <asm/csr.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/sbi.h> #ifdef CONFIG_64BIT -static unsigned long stage2_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); -static unsigned long stage2_pgd_levels = 3; -#define stage2_index_bits 9 +static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels = 3; +#define gstage_index_bits 9 #else -static unsigned long stage2_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); -static unsigned long stage2_pgd_levels = 2; -#define stage2_index_bits 10 +static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels = 2; +#define gstage_index_bits 10 #endif -#define stage2_pgd_xbits 2 -#define stage2_pgd_size (1UL << (HGATP_PAGE_SHIFT + stage2_pgd_xbits)) -#define stage2_gpa_bits (HGATP_PAGE_SHIFT + \ - (stage2_pgd_levels * stage2_index_bits) + \ - stage2_pgd_xbits) -#define stage2_gpa_size ((gpa_t)(1ULL << stage2_gpa_bits)) +#define gstage_pgd_xbits 2 +#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits)) +#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \ + (gstage_pgd_levels * gstage_index_bits) + \ + gstage_pgd_xbits) +#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits)) -#define stage2_pte_leaf(__ptep) \ +#define gstage_pte_leaf(__ptep) \ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) -static inline unsigned long stage2_pte_index(gpa_t addr, u32 level) +static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) { unsigned long mask; - unsigned long shift = HGATP_PAGE_SHIFT + (stage2_index_bits * level); + unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level); - if (level == (stage2_pgd_levels - 1)) - mask = (PTRS_PER_PTE * (1UL << stage2_pgd_xbits)) - 1; + if (level == (gstage_pgd_levels - 1)) + mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1; else mask = PTRS_PER_PTE - 1; return (addr >> shift) & mask; } -static inline unsigned long stage2_pte_page_vaddr(pte_t pte) +static inline unsigned long gstage_pte_page_vaddr(pte_t pte) { return (unsigned long)pfn_to_virt(pte_val(pte) >> _PAGE_PFN_SHIFT); } -static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level) +static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) { u32 i; unsigned long psz = 1UL << 12; - for (i = 0; i < stage2_pgd_levels; i++) { - if (page_size == (psz << (i * stage2_index_bits))) { + for (i = 0; i < gstage_pgd_levels; i++) { + if (page_size == (psz << (i * gstage_index_bits))) { *out_level = i; return 0; } @@ -73,27 +72,39 @@ static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level) return -EINVAL; } -static int stage2_level_to_page_size(u32 level, unsigned long *out_pgsize) +static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) { - if (stage2_pgd_levels < level) + if (gstage_pgd_levels < level) return -EINVAL; - *out_pgsize = 1UL << (12 + (level * stage2_index_bits)); + *out_pgorder = 12 + (level * gstage_index_bits); + return 0; +} + +static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) +{ + int rc; + unsigned long page_order = PAGE_SHIFT; + + rc = gstage_level_to_page_order(level, &page_order); + if (rc) + return rc; + *out_pgsize = BIT(page_order); return 0; } -static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, +static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { pte_t *ptep; - u32 current_level = stage2_pgd_levels - 1; + u32 current_level = gstage_pgd_levels - 1; *ptep_level = current_level; ptep = (pte_t *)kvm->arch.pgd; - ptep = &ptep[stage2_pte_index(addr, current_level)]; + ptep = &ptep[gstage_pte_index(addr, current_level)]; while (ptep && pte_val(*ptep)) { - if (stage2_pte_leaf(ptep)) { + if (gstage_pte_leaf(ptep)) { *ptep_level = current_level; *ptepp = ptep; return true; @@ -102,8 +113,8 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, if (current_level) { current_level--; *ptep_level = current_level; - ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); - ptep = &ptep[stage2_pte_index(addr, current_level)]; + ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); + ptep = &ptep[gstage_pte_index(addr, current_level)]; } else { ptep = NULL; } @@ -112,38 +123,30 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, return false; } -static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) +static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) { - unsigned long size = PAGE_SIZE; - struct kvm_vmid *vmid = &kvm->arch.vmid; + unsigned long order = PAGE_SHIFT; - if (stage2_level_to_page_size(level, &size)) + if (gstage_level_to_page_order(level, &order)) return; - addr &= ~(size - 1); + addr &= ~(BIT(order) - 1); - /* - * TODO: Instead of cpu_online_mask, we should only target CPUs - * where the Guest/VM is running. - */ - preempt_disable(); - sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size, - READ_ONCE(vmid->vmid)); - preempt_enable(); + kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order); } -static int stage2_set_pte(struct kvm *kvm, u32 level, +static int gstage_set_pte(struct kvm *kvm, u32 level, struct kvm_mmu_memory_cache *pcache, gpa_t addr, const pte_t *new_pte) { - u32 current_level = stage2_pgd_levels - 1; + u32 current_level = gstage_pgd_levels - 1; pte_t *next_ptep = (pte_t *)kvm->arch.pgd; - pte_t *ptep = &next_ptep[stage2_pte_index(addr, current_level)]; + pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)]; if (current_level < level) return -EINVAL; while (current_level != level) { - if (stage2_pte_leaf(ptep)) + if (gstage_pte_leaf(ptep)) return -EEXIST; if (!pte_val(*ptep)) { @@ -155,23 +158,23 @@ static int stage2_set_pte(struct kvm *kvm, u32 level, *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)), __pgprot(_PAGE_TABLE)); } else { - if (stage2_pte_leaf(ptep)) + if (gstage_pte_leaf(ptep)) return -EEXIST; - next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); } current_level--; - ptep = &next_ptep[stage2_pte_index(addr, current_level)]; + ptep = &next_ptep[gstage_pte_index(addr, current_level)]; } *ptep = *new_pte; - if (stage2_pte_leaf(ptep)) - stage2_remote_tlb_flush(kvm, current_level, addr); + if (gstage_pte_leaf(ptep)) + gstage_remote_tlb_flush(kvm, current_level, addr); return 0; } -static int stage2_map_page(struct kvm *kvm, +static int gstage_map_page(struct kvm *kvm, struct kvm_mmu_memory_cache *pcache, gpa_t gpa, phys_addr_t hpa, unsigned long page_size, @@ -182,7 +185,7 @@ static int stage2_map_page(struct kvm *kvm, pte_t new_pte; pgprot_t prot; - ret = stage2_page_size_to_level(page_size, &level); + ret = gstage_page_size_to_level(page_size, &level); if (ret) return ret; @@ -193,9 +196,9 @@ static int stage2_map_page(struct kvm *kvm, * PTE so that software can update these bits. * * We support both options mentioned above. To achieve this, we - * always set 'A' and 'D' PTE bits at time of creating stage2 + * always set 'A' and 'D' PTE bits at time of creating G-stage * mapping. To support KVM dirty page logging with both options - * mentioned above, we will write-protect stage2 PTEs to track + * mentioned above, we will write-protect G-stage PTEs to track * dirty pages. */ @@ -213,24 +216,24 @@ static int stage2_map_page(struct kvm *kvm, new_pte = pfn_pte(PFN_DOWN(hpa), prot); new_pte = pte_mkdirty(new_pte); - return stage2_set_pte(kvm, level, pcache, gpa, &new_pte); + return gstage_set_pte(kvm, level, pcache, gpa, &new_pte); } -enum stage2_op { - STAGE2_OP_NOP = 0, /* Nothing */ - STAGE2_OP_CLEAR, /* Clear/Unmap */ - STAGE2_OP_WP, /* Write-protect */ +enum gstage_op { + GSTAGE_OP_NOP = 0, /* Nothing */ + GSTAGE_OP_CLEAR, /* Clear/Unmap */ + GSTAGE_OP_WP, /* Write-protect */ }; -static void stage2_op_pte(struct kvm *kvm, gpa_t addr, - pte_t *ptep, u32 ptep_level, enum stage2_op op) +static void gstage_op_pte(struct kvm *kvm, gpa_t addr, + pte_t *ptep, u32 ptep_level, enum gstage_op op) { int i, ret; pte_t *next_ptep; u32 next_ptep_level; unsigned long next_page_size, page_size; - ret = stage2_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(ptep_level, &page_size); if (ret) return; @@ -239,31 +242,31 @@ static void stage2_op_pte(struct kvm *kvm, gpa_t addr, if (!pte_val(*ptep)) return; - if (ptep_level && !stage2_pte_leaf(ptep)) { - next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + if (ptep_level && !gstage_pte_leaf(ptep)) { + next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); next_ptep_level = ptep_level - 1; - ret = stage2_level_to_page_size(next_ptep_level, + ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); if (ret) return; - if (op == STAGE2_OP_CLEAR) + if (op == GSTAGE_OP_CLEAR) set_pte(ptep, __pte(0)); for (i = 0; i < PTRS_PER_PTE; i++) - stage2_op_pte(kvm, addr + i * next_page_size, + gstage_op_pte(kvm, addr + i * next_page_size, &next_ptep[i], next_ptep_level, op); - if (op == STAGE2_OP_CLEAR) + if (op == GSTAGE_OP_CLEAR) put_page(virt_to_page(next_ptep)); } else { - if (op == STAGE2_OP_CLEAR) + if (op == GSTAGE_OP_CLEAR) set_pte(ptep, __pte(0)); - else if (op == STAGE2_OP_WP) + else if (op == GSTAGE_OP_WP) set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE)); - stage2_remote_tlb_flush(kvm, ptep_level, addr); + gstage_remote_tlb_flush(kvm, ptep_level, addr); } } -static void stage2_unmap_range(struct kvm *kvm, gpa_t start, +static void gstage_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size, bool may_block) { int ret; @@ -274,9 +277,9 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t addr = start, end = start + size; while (addr < end) { - found_leaf = stage2_get_leaf_entry(kvm, addr, + found_leaf = gstage_get_leaf_entry(kvm, addr, &ptep, &ptep_level); - ret = stage2_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(ptep_level, &page_size); if (ret) break; @@ -284,8 +287,8 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start, goto next; if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - stage2_op_pte(kvm, addr, ptep, - ptep_level, STAGE2_OP_CLEAR); + gstage_op_pte(kvm, addr, ptep, + ptep_level, GSTAGE_OP_CLEAR); next: addr += page_size; @@ -299,7 +302,7 @@ next: } } -static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) +static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) { int ret; pte_t *ptep; @@ -309,9 +312,9 @@ static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) unsigned long page_size; while (addr < end) { - found_leaf = stage2_get_leaf_entry(kvm, addr, + found_leaf = gstage_get_leaf_entry(kvm, addr, &ptep, &ptep_level); - ret = stage2_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(ptep_level, &page_size); if (ret) break; @@ -319,15 +322,15 @@ static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) goto next; if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - stage2_op_pte(kvm, addr, ptep, - ptep_level, STAGE2_OP_WP); + gstage_op_pte(kvm, addr, ptep, + ptep_level, GSTAGE_OP_WP); next: addr += page_size; } } -static void stage2_wp_memory_region(struct kvm *kvm, int slot) +static void gstage_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); @@ -335,12 +338,12 @@ static void stage2_wp_memory_region(struct kvm *kvm, int slot) phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); - stage2_wp_range(kvm, start, end); + gstage_wp_range(kvm, start, end); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, +static int gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, unsigned long size, bool writable) { pte_t pte; @@ -361,12 +364,12 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, if (!writable) pte = pte_wrprotect(pte); - ret = kvm_mmu_topup_memory_cache(&pcache, stage2_pgd_levels); + ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels); if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, 0, &pcache, addr, &pte); + ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -388,7 +391,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; - stage2_wp_range(kvm, start, end); + gstage_wp_range(kvm, start, end); } void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -411,7 +414,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_riscv_stage2_free_pgd(kvm); + kvm_riscv_gstage_free_pgd(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -421,7 +424,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); - stage2_unmap_range(kvm, gpa, size, false); + gstage_unmap_range(kvm, gpa, size, false); spin_unlock(&kvm->mmu_lock); } @@ -436,7 +439,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * the memory slot is write protected. */ if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) - stage2_wp_memory_region(kvm, new->id); + gstage_wp_memory_region(kvm, new->id); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -458,7 +461,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest GPA space. */ if ((new->base_gfn + new->npages) >= - (stage2_gpa_size >> PAGE_SHIFT)) + (gstage_gpa_size >> PAGE_SHIFT)) return -EFAULT; hva = new->userspace_addr; @@ -514,7 +517,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, goto out; } - ret = stage2_ioremap(kvm, gpa, pa, + ret = gstage_ioremap(kvm, gpa, pa, vm_end - vm_start, writable); if (ret) break; @@ -527,7 +530,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, spin_lock(&kvm->mmu_lock); if (ret) - stage2_unmap_range(kvm, base_gpa, size, false); + gstage_unmap_range(kvm, base_gpa, size, false); spin_unlock(&kvm->mmu_lock); out: @@ -540,7 +543,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.pgd) return false; - stage2_unmap_range(kvm, range->start << PAGE_SHIFT, + gstage_unmap_range(kvm, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); return false; @@ -556,10 +559,10 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(range->end - range->start != 1); - ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT, + ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT, __pfn_to_phys(pfn), PAGE_SIZE, true, true); if (ret) { - kvm_debug("Failed to map stage2 page (error %d)\n", ret); + kvm_debug("Failed to map G-stage page (error %d)\n", ret); return true; } @@ -577,7 +580,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); - if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; @@ -595,14 +598,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); - if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; return pte_young(*ptep); } -int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, +int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write) { @@ -648,9 +651,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, } /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(pcache, stage2_pgd_levels); + ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); if (ret) { - kvm_err("Failed to topup stage2 cache\n"); + kvm_err("Failed to topup G-stage cache\n"); return ret; } @@ -680,15 +683,15 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, if (writeable) { kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); - ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, + ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, false, true); } else { - ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, + ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, true, true); } if (ret) - kvm_err("Failed to map in stage2\n"); + kvm_err("Failed to map in G-stage\n"); out_unlock: spin_unlock(&kvm->mmu_lock); @@ -697,7 +700,7 @@ out_unlock: return ret; } -int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) +int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) { struct page *pgd_page; @@ -707,7 +710,7 @@ int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) } pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - get_order(stage2_pgd_size)); + get_order(gstage_pgd_size)); if (!pgd_page) return -ENOMEM; kvm->arch.pgd = page_to_virt(pgd_page); @@ -716,13 +719,13 @@ int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) return 0; } -void kvm_riscv_stage2_free_pgd(struct kvm *kvm) +void kvm_riscv_gstage_free_pgd(struct kvm *kvm) { void *pgd = NULL; spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - stage2_unmap_range(kvm, 0UL, stage2_gpa_size, false); + gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; kvm->arch.pgd_phys = 0; @@ -730,12 +733,12 @@ void kvm_riscv_stage2_free_pgd(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); if (pgd) - free_pages((unsigned long)pgd, get_order(stage2_pgd_size)); + free_pages((unsigned long)pgd, get_order(gstage_pgd_size)); } -void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu) +void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) { - unsigned long hgatp = stage2_mode; + unsigned long hgatp = gstage_mode; struct kvm_arch *k = &vcpu->kvm->arch; hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & @@ -744,31 +747,40 @@ void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu) csr_write(CSR_HGATP, hgatp); - if (!kvm_riscv_stage2_vmid_bits()) - __kvm_riscv_hfence_gvma_all(); + if (!kvm_riscv_gstage_vmid_bits()) + kvm_riscv_local_hfence_gvma_all(); } -void kvm_riscv_stage2_mode_detect(void) +void kvm_riscv_gstage_mode_detect(void) { #ifdef CONFIG_64BIT - /* Try Sv48x4 stage2 mode */ + /* Try Sv57x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { + gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); + gstage_pgd_levels = 5; + goto skip_sv48x4_test; + } + + /* Try Sv48x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { - stage2_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); - stage2_pgd_levels = 4; + gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); + gstage_pgd_levels = 4; } - csr_write(CSR_HGATP, 0); +skip_sv48x4_test: - __kvm_riscv_hfence_gvma_all(); + csr_write(CSR_HGATP, 0); + kvm_riscv_local_hfence_gvma_all(); #endif } -unsigned long kvm_riscv_stage2_mode(void) +unsigned long kvm_riscv_gstage_mode(void) { - return stage2_mode >> HGATP_MODE_SHIFT; + return gstage_mode >> HGATP_MODE_SHIFT; } -int kvm_riscv_stage2_gpa_bits(void) +int kvm_riscv_gstage_gpa_bits(void) { - return stage2_gpa_bits; + return gstage_gpa_bits; } diff --git a/arch/riscv/kvm/tlb.S b/arch/riscv/kvm/tlb.S deleted file mode 100644 index 899f75d60bad..000000000000 --- a/arch/riscv/kvm/tlb.S +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2019 Western Digital Corporation or its affiliates. - * - * Authors: - * Anup Patel <anup.patel@wdc.com> - */ - -#include <linux/linkage.h> -#include <asm/asm.h> - - .text - .altmacro - .option norelax - - /* - * Instruction encoding of hfence.gvma is: - * HFENCE.GVMA rs1, rs2 - * HFENCE.GVMA zero, rs2 - * HFENCE.GVMA rs1 - * HFENCE.GVMA - * - * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2 - * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2 - * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1 - * rs1==zero and rs2==zero ==> HFENCE.GVMA - * - * Instruction encoding of HFENCE.GVMA is: - * 0110001 rs2(5) rs1(5) 000 00000 1110011 - */ - -ENTRY(__kvm_riscv_hfence_gvma_vmid_gpa) - /* - * rs1 = a0 (GPA >> 2) - * rs2 = a1 (VMID) - * HFENCE.GVMA a0, a1 - * 0110001 01011 01010 000 00000 1110011 - */ - .word 0x62b50073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_vmid_gpa) - -ENTRY(__kvm_riscv_hfence_gvma_vmid) - /* - * rs1 = zero - * rs2 = a0 (VMID) - * HFENCE.GVMA zero, a0 - * 0110001 01010 00000 000 00000 1110011 - */ - .word 0x62a00073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_vmid) - -ENTRY(__kvm_riscv_hfence_gvma_gpa) - /* - * rs1 = a0 (GPA >> 2) - * rs2 = zero - * HFENCE.GVMA a0 - * 0110001 00000 01010 000 00000 1110011 - */ - .word 0x62050073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_gpa) - -ENTRY(__kvm_riscv_hfence_gvma_all) - /* - * rs1 = zero - * rs2 = zero - * HFENCE.GVMA - * 0110001 00000 00000 000 00000 1110011 - */ - .word 0x62000073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_all) diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c new file mode 100644 index 000000000000..1a76d0b1907d --- /dev/null +++ b/arch/riscv/kvm/tlb.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#include <linux/bitmap.h> +#include <linux/cpumask.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/kvm_host.h> +#include <asm/cacheflush.h> +#include <asm/csr.h> + +/* + * Instruction encoding of hfence.gvma is: + * HFENCE.GVMA rs1, rs2 + * HFENCE.GVMA zero, rs2 + * HFENCE.GVMA rs1 + * HFENCE.GVMA + * + * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2 + * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2 + * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1 + * rs1==zero and rs2==zero ==> HFENCE.GVMA + * + * Instruction encoding of HFENCE.GVMA is: + * 0110001 rs2(5) rs1(5) 000 00000 1110011 + */ + +void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, + gpa_t gpa, gpa_t gpsz, + unsigned long order) +{ + gpa_t pos; + + if (PTRS_PER_PTE < (gpsz >> order)) { + kvm_riscv_local_hfence_gvma_vmid_all(vmid); + return; + } + + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) { + /* + * rs1 = a0 (GPA >> 2) + * rs2 = a1 (VMID) + * HFENCE.GVMA a0, a1 + * 0110001 01011 01010 000 00000 1110011 + */ + asm volatile ("srli a0, %0, 2\n" + "add a1, %1, zero\n" + ".word 0x62b50073\n" + :: "r" (pos), "r" (vmid) + : "a0", "a1", "memory"); + } +} + +void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid) +{ + /* + * rs1 = zero + * rs2 = a0 (VMID) + * HFENCE.GVMA zero, a0 + * 0110001 01010 00000 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + ".word 0x62a00073\n" + :: "r" (vmid) : "a0", "memory"); +} + +void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, + unsigned long order) +{ + gpa_t pos; + + if (PTRS_PER_PTE < (gpsz >> order)) { + kvm_riscv_local_hfence_gvma_all(); + return; + } + + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) { + /* + * rs1 = a0 (GPA >> 2) + * rs2 = zero + * HFENCE.GVMA a0 + * 0110001 00000 01010 000 00000 1110011 + */ + asm volatile ("srli a0, %0, 2\n" + ".word 0x62050073\n" + :: "r" (pos) : "a0", "memory"); + } +} + +void kvm_riscv_local_hfence_gvma_all(void) +{ + /* + * rs1 = zero + * rs2 = zero + * HFENCE.GVMA + * 0110001 00000 00000 000 00000 1110011 + */ + asm volatile (".word 0x62000073" ::: "memory"); +} + +/* + * Instruction encoding of hfence.gvma is: + * HFENCE.VVMA rs1, rs2 + * HFENCE.VVMA zero, rs2 + * HFENCE.VVMA rs1 + * HFENCE.VVMA + * + * rs1!=zero and rs2!=zero ==> HFENCE.VVMA rs1, rs2 + * rs1==zero and rs2!=zero ==> HFENCE.VVMA zero, rs2 + * rs1!=zero and rs2==zero ==> HFENCE.VVMA rs1 + * rs1==zero and rs2==zero ==> HFENCE.VVMA + * + * Instruction encoding of HFENCE.VVMA is: + * 0010001 rs2(5) rs1(5) 000 00000 1110011 + */ + +void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, + unsigned long asid, + unsigned long gva, + unsigned long gvsz, + unsigned long order) +{ + unsigned long pos, hgatp; + + if (PTRS_PER_PTE < (gvsz >> order)) { + kvm_riscv_local_hfence_vvma_asid_all(vmid, asid); + return; + } + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) { + /* + * rs1 = a0 (GVA) + * rs2 = a1 (ASID) + * HFENCE.VVMA a0, a1 + * 0010001 01011 01010 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + "add a1, %1, zero\n" + ".word 0x22b50073\n" + :: "r" (pos), "r" (asid) + : "a0", "a1", "memory"); + } + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, + unsigned long asid) +{ + unsigned long hgatp; + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + /* + * rs1 = zero + * rs2 = a0 (ASID) + * HFENCE.VVMA zero, a0 + * 0010001 01010 00000 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + ".word 0x22a00073\n" + :: "r" (asid) : "a0", "memory"); + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, + unsigned long gva, unsigned long gvsz, + unsigned long order) +{ + unsigned long pos, hgatp; + + if (PTRS_PER_PTE < (gvsz >> order)) { + kvm_riscv_local_hfence_vvma_all(vmid); + return; + } + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) { + /* + * rs1 = a0 (GVA) + * rs2 = zero + * HFENCE.VVMA a0 + * 0010001 00000 01010 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + ".word 0x22050073\n" + :: "r" (pos) : "a0", "memory"); + } + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_hfence_vvma_all(unsigned long vmid) +{ + unsigned long hgatp; + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + /* + * rs1 = zero + * rs2 = zero + * HFENCE.VVMA + * 0010001 00000 00000 000 00000 1110011 + */ + asm volatile (".word 0x22000073" ::: "memory"); + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu) +{ + unsigned long vmid; + + if (!kvm_riscv_gstage_vmid_bits() || + vcpu->arch.last_exit_cpu == vcpu->cpu) + return; + + /* + * On RISC-V platforms with hardware VMID support, we share same + * VMID for all VCPUs of a particular Guest/VM. This means we might + * have stale G-stage TLB entries on the current Host CPU due to + * some other VCPU of the same Guest which ran previously on the + * current Host CPU. + * + * To cleanup stale TLB entries, we simply flush all G-stage TLB + * entries by VMID whenever underlying Host CPU changes for a VCPU. + */ + + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); + kvm_riscv_local_hfence_gvma_vmid_all(vmid); +} + +void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) +{ + local_flush_icache_all(); +} + +void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu) +{ + struct kvm_vmid *vmid; + + vmid = &vcpu->kvm->arch.vmid; + kvm_riscv_local_hfence_gvma_vmid_all(READ_ONCE(vmid->vmid)); +} + +void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu) +{ + struct kvm_vmid *vmid; + + vmid = &vcpu->kvm->arch.vmid; + kvm_riscv_local_hfence_vvma_all(READ_ONCE(vmid->vmid)); +} + +static bool vcpu_hfence_dequeue(struct kvm_vcpu *vcpu, + struct kvm_riscv_hfence *out_data) +{ + bool ret = false; + struct kvm_vcpu_arch *varch = &vcpu->arch; + + spin_lock(&varch->hfence_lock); + + if (varch->hfence_queue[varch->hfence_head].type) { + memcpy(out_data, &varch->hfence_queue[varch->hfence_head], + sizeof(*out_data)); + varch->hfence_queue[varch->hfence_head].type = 0; + + varch->hfence_head++; + if (varch->hfence_head == KVM_RISCV_VCPU_MAX_HFENCE) + varch->hfence_head = 0; + + ret = true; + } + + spin_unlock(&varch->hfence_lock); + + return ret; +} + +static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu, + const struct kvm_riscv_hfence *data) +{ + bool ret = false; + struct kvm_vcpu_arch *varch = &vcpu->arch; + + spin_lock(&varch->hfence_lock); + + if (!varch->hfence_queue[varch->hfence_tail].type) { + memcpy(&varch->hfence_queue[varch->hfence_tail], + data, sizeof(*data)); + + varch->hfence_tail++; + if (varch->hfence_tail == KVM_RISCV_VCPU_MAX_HFENCE) + varch->hfence_tail = 0; + + ret = true; + } + + spin_unlock(&varch->hfence_lock); + + return ret; +} + +void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) +{ + struct kvm_riscv_hfence d = { 0 }; + struct kvm_vmid *v = &vcpu->kvm->arch.vmid; + + while (vcpu_hfence_dequeue(vcpu, &d)) { + switch (d.type) { + case KVM_RISCV_HFENCE_UNKNOWN: + break; + case KVM_RISCV_HFENCE_GVMA_VMID_GPA: + kvm_riscv_local_hfence_gvma_vmid_gpa( + READ_ONCE(v->vmid), + d.addr, d.size, d.order); + break; + case KVM_RISCV_HFENCE_VVMA_ASID_GVA: + kvm_riscv_local_hfence_vvma_asid_gva( + READ_ONCE(v->vmid), d.asid, + d.addr, d.size, d.order); + break; + case KVM_RISCV_HFENCE_VVMA_ASID_ALL: + kvm_riscv_local_hfence_vvma_asid_all( + READ_ONCE(v->vmid), d.asid); + break; + case KVM_RISCV_HFENCE_VVMA_GVA: + kvm_riscv_local_hfence_vvma_gva( + READ_ONCE(v->vmid), + d.addr, d.size, d.order); + break; + default: + break; + } + } +} + +static void make_xfence_request(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned int req, unsigned int fallback_req, + const struct kvm_riscv_hfence *data) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + unsigned int actual_req = req; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); + + bitmap_clear(vcpu_mask, 0, KVM_MAX_VCPUS); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (hbase != -1UL) { + if (vcpu->vcpu_id < hbase) + continue; + if (!(hmask & (1UL << (vcpu->vcpu_id - hbase)))) + continue; + } + + bitmap_set(vcpu_mask, i, 1); + + if (!data || !data->type) + continue; + + /* + * Enqueue hfence data to VCPU hfence queue. If we don't + * have space in the VCPU hfence queue then fallback to + * a more conservative hfence request. + */ + if (!vcpu_hfence_enqueue(vcpu, data)) + actual_req = fallback_req; + } + + kvm_make_vcpus_request_mask(kvm, actual_req, vcpu_mask); +} + +void kvm_riscv_fence_i(struct kvm *kvm, + unsigned long hbase, unsigned long hmask) +{ + make_xfence_request(kvm, hbase, hmask, KVM_REQ_FENCE_I, + KVM_REQ_FENCE_I, NULL); +} + +void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + gpa_t gpa, gpa_t gpsz, + unsigned long order) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA; + data.asid = 0; + data.addr = gpa; + data.size = gpsz; + data.order = order; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_GVMA_VMID_ALL, &data); +} + +void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask) +{ + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL, + KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL); +} + +void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order, unsigned long asid) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA; + data.asid = asid; + data.addr = gva; + data.size = gvsz; + data.order = order; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long asid) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL; + data.asid = asid; + data.addr = data.size = data.order = 0; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_VVMA_GVA; + data.asid = 0; + data.addr = gva; + data.size = gvsz; + data.order = order; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +void kvm_riscv_hfence_vvma_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask) +{ + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL, + KVM_REQ_HFENCE_VVMA_ALL, NULL); +} diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 7461f964d20a..7f4ad5e4373a 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -67,6 +67,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) if (loaded) kvm_arch_vcpu_put(vcpu); + vcpu->arch.last_exit_cpu = -1; + memcpy(csr, reset_csr, sizeof(*csr)); memcpy(cntx, reset_cntx, sizeof(*cntx)); @@ -78,6 +80,10 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) WRITE_ONCE(vcpu->arch.irqs_pending, 0); WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); + vcpu->arch.hfence_head = 0; + vcpu->arch.hfence_tail = 0; + memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue)); + /* Reset the guest CSRs for hotplug usecase */ if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); @@ -101,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Setup ISA features available to VCPU */ vcpu->arch.isa = riscv_isa_extension_base(NULL) & KVM_RISCV_ISA_ALLOWED; + /* Setup VCPU hfence queue */ + spin_lock_init(&vcpu->arch.hfence_lock); + /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ cntx = &vcpu->arch.guest_reset_context; cntx->sstatus = SR_SPP | SR_SPIE; @@ -137,7 +146,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) /* Cleanup VCPU timer */ kvm_riscv_vcpu_timer_deinit(vcpu); - /* Free unused pages pre-allocated for Stage2 page table mappings */ + /* Free unused pages pre-allocated for G-stage page table mappings */ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } @@ -365,6 +374,101 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, return 0; } +/* Mapping between KVM ISA Extension ID & Host ISA extension ID */ +static unsigned long kvm_isa_ext_arr[] = { + RISCV_ISA_EXT_a, + RISCV_ISA_EXT_c, + RISCV_ISA_EXT_d, + RISCV_ISA_EXT_f, + RISCV_ISA_EXT_h, + RISCV_ISA_EXT_i, + RISCV_ISA_EXT_m, +}; + +static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_ISA_EXT); + unsigned long reg_val = 0; + unsigned long host_isa_ext; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) + return -EINVAL; + + host_isa_ext = kvm_isa_ext_arr[reg_num]; + if (__riscv_isa_extension_available(&vcpu->arch.isa, host_isa_ext)) + reg_val = 1; /* Mark the given extension as available */ + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_ISA_EXT); + unsigned long reg_val; + unsigned long host_isa_ext; + unsigned long host_isa_ext_mask; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + host_isa_ext = kvm_isa_ext_arr[reg_num]; + if (!__riscv_isa_extension_available(NULL, host_isa_ext)) + return -EOPNOTSUPP; + + if (host_isa_ext >= RISCV_ISA_EXT_BASE && + host_isa_ext < RISCV_ISA_EXT_MAX) { + /* + * Multi-letter ISA extension. Currently there is no provision + * to enable/disable the multi-letter ISA extensions for guests. + * Return success if the request is to enable any ISA extension + * that is available in the hardware. + * Return -EOPNOTSUPP otherwise. + */ + if (!reg_val) + return -EOPNOTSUPP; + else + return 0; + } + + /* Single letter base ISA extension */ + if (!vcpu->arch.ran_atleast_once) { + host_isa_ext_mask = BIT_MASK(host_isa_ext); + if (!reg_val && (host_isa_ext_mask & KVM_RISCV_ISA_DISABLE_ALLOWED)) + vcpu->arch.isa &= ~host_isa_ext_mask; + else + vcpu->arch.isa |= host_isa_ext_mask; + vcpu->arch.isa &= riscv_isa_extension_base(NULL); + vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED; + kvm_riscv_vcpu_fp_reset(vcpu); + } else { + return -EOPNOTSUPP; + } + + return 0; +} + static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { @@ -382,6 +486,8 @@ static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D) return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, KVM_REG_RISCV_FP_D); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT) + return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg); return -EINVAL; } @@ -403,6 +509,8 @@ static int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu, else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D) return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, KVM_REG_RISCV_FP_D); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT) + return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg); return -EINVAL; } @@ -635,7 +743,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); - kvm_riscv_stage2_update_hgatp(vcpu); + kvm_riscv_gstage_update_hgatp(vcpu); kvm_riscv_vcpu_timer_restore(vcpu); @@ -690,10 +798,23 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) kvm_riscv_reset_vcpu(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu)) - kvm_riscv_stage2_update_hgatp(vcpu); + kvm_riscv_gstage_update_hgatp(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - __kvm_riscv_hfence_gvma_all(); + if (kvm_check_request(KVM_REQ_FENCE_I, vcpu)) + kvm_riscv_fence_i_process(vcpu); + + /* + * The generic KVM_REQ_TLB_FLUSH is same as + * KVM_REQ_HFENCE_GVMA_VMID_ALL + */ + if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu)) + kvm_riscv_hfence_gvma_vmid_all_process(vcpu); + + if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu)) + kvm_riscv_hfence_vvma_all_process(vcpu); + + if (kvm_check_request(KVM_REQ_HFENCE, vcpu)) + kvm_riscv_hfence_process(vcpu); } } @@ -715,6 +836,7 @@ static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) { guest_state_enter_irqoff(); __kvm_riscv_switch_to(&vcpu->arch); + vcpu->arch.last_exit_cpu = vcpu->cpu; guest_state_exit_irqoff(); } @@ -762,7 +884,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Check conditions before entering the guest */ cond_resched(); - kvm_riscv_stage2_vmid_update(vcpu); + kvm_riscv_gstage_vmid_update(vcpu); kvm_riscv_check_vcpu_requests(vcpu); @@ -800,7 +922,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_riscv_update_hvip(vcpu); if (ret <= 0 || - kvm_riscv_stage2_vmid_ver_changed(&vcpu->kvm->arch.vmid) || + kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); @@ -809,6 +931,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } + /* + * Cleanup stale TLB enteries + * + * Note: This should be done after G-stage VMID has been + * updated using kvm_riscv_gstage_vmid_ver_changed() + */ + kvm_riscv_local_tlb_sanitize(vcpu); + guest_timing_enter_irqoff(); kvm_riscv_vcpu_enter_exit(vcpu); diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index a72c15d4b42a..dbb09afd7546 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -412,7 +412,7 @@ static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run, return 0; } -static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, +static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) { struct kvm_memory_slot *memslot; @@ -440,7 +440,7 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, }; } - ret = kvm_riscv_stage2_map(vcpu, memslot, fault_addr, hva, + ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva, (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false); if (ret < 0) return ret; @@ -686,7 +686,7 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case EXC_LOAD_GUEST_PAGE_FAULT: case EXC_STORE_GUEST_PAGE_FAULT: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) - ret = stage2_page_fault(vcpu, run, trap); + ret = gstage_page_fault(vcpu, run, trap); break; case EXC_SUPERVISOR_SYSCALL: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 0f217365c287..4c034d8a606a 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -81,43 +81,41 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run struct kvm_cpu_trap *utrap, bool *exit) { int ret = 0; - unsigned long i; - struct cpumask cm; - struct kvm_vcpu *tmp; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; unsigned long funcid = cp->a6; - cpumask_clear(&cm); - kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - if (hbase != -1UL) { - if (tmp->vcpu_id < hbase) - continue; - if (!(hmask & (1UL << (tmp->vcpu_id - hbase)))) - continue; - } - if (tmp->cpu < 0) - continue; - cpumask_set_cpu(tmp->cpu, &cm); - } - switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_remote_fence_i(&cm); + kvm_riscv_fence_i(vcpu->kvm, hbase, hmask); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3); + if (cp->a2 == 0 && cp->a3 == 0) + kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); + else + kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, + cp->a2, cp->a3, PAGE_SHIFT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2, - cp->a3, cp->a4); + if (cp->a2 == 0 && cp->a3 == 0) + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, + hbase, hmask, cp->a4); + else + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, + hbase, hmask, + cp->a2, cp->a3, + PAGE_SHIFT, cp->a4); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA: case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: - /* TODO: implement for nested hypervisor case */ + /* + * Until nested virtualization is implemented, the + * SBI HFENCE calls should be treated as NOPs + */ + break; default: ret = -EOPNOTSUPP; } diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index da4d6c99c2cf..8a91a14e7139 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -23,7 +23,6 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, int i, ret = 0; u64 next_cycle; struct kvm_vcpu *rvcpu; - struct cpumask cm; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -80,19 +79,29 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (utrap->scause) break; - cpumask_clear(&cm); - for_each_set_bit(i, &hmask, BITS_PER_LONG) { - rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i); - if (rvcpu->cpu < 0) - continue; - cpumask_set_cpu(rvcpu->cpu, &cm); - } if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) - ret = sbi_remote_fence_i(&cm); - else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) - ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2); - else - ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3); + kvm_riscv_fence_i(vcpu->kvm, 0, hmask); + else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) { + if (cp->a1 == 0 && cp->a2 == 0) + kvm_riscv_hfence_vvma_all(vcpu->kvm, + 0, hmask); + else + kvm_riscv_hfence_vvma_gva(vcpu->kvm, + 0, hmask, + cp->a1, cp->a2, + PAGE_SHIFT); + } else { + if (cp->a1 == 0 && cp->a2 == 0) + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, + 0, hmask, + cp->a3); + else + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, + 0, hmask, + cp->a1, cp->a2, + PAGE_SHIFT, + cp->a3); + } break; default: ret = -EINVAL; diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index c768f75279ef..945a2bf5e3f6 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -31,13 +31,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int r; - r = kvm_riscv_stage2_alloc_pgd(kvm); + r = kvm_riscv_gstage_alloc_pgd(kvm); if (r) return r; - r = kvm_riscv_stage2_vmid_init(kvm); + r = kvm_riscv_gstage_vmid_init(kvm); if (r) { - kvm_riscv_stage2_free_pgd(kvm); + kvm_riscv_gstage_free_pgd(kvm); return r; } @@ -75,7 +75,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_VM_GPA_BITS: - r = kvm_riscv_stage2_gpa_bits(); + r = kvm_riscv_gstage_gpa_bits(); break; default: r = 0; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 2fa4f7b1813d..9f764df125db 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -11,16 +11,16 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/smp.h> #include <linux/kvm_host.h> #include <asm/csr.h> -#include <asm/sbi.h> static unsigned long vmid_version = 1; static unsigned long vmid_next; static unsigned long vmid_bits; static DEFINE_SPINLOCK(vmid_lock); -void kvm_riscv_stage2_vmid_detect(void) +void kvm_riscv_gstage_vmid_detect(void) { unsigned long old; @@ -33,19 +33,19 @@ void kvm_riscv_stage2_vmid_detect(void) csr_write(CSR_HGATP, old); /* We polluted local TLB so flush all guest TLB */ - __kvm_riscv_hfence_gvma_all(); + kvm_riscv_local_hfence_gvma_all(); /* We don't use VMID bits if they are not sufficient */ if ((1UL << vmid_bits) < num_possible_cpus()) vmid_bits = 0; } -unsigned long kvm_riscv_stage2_vmid_bits(void) +unsigned long kvm_riscv_gstage_vmid_bits(void) { return vmid_bits; } -int kvm_riscv_stage2_vmid_init(struct kvm *kvm) +int kvm_riscv_gstage_vmid_init(struct kvm *kvm) { /* Mark the initial VMID and VMID version invalid */ kvm->arch.vmid.vmid_version = 0; @@ -54,7 +54,7 @@ int kvm_riscv_stage2_vmid_init(struct kvm *kvm) return 0; } -bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid) +bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid) { if (!vmid_bits) return false; @@ -63,13 +63,18 @@ bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid) READ_ONCE(vmid_version)); } -void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) +static void __local_hfence_gvma_all(void *info) +{ + kvm_riscv_local_hfence_gvma_all(); +} + +void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu) { unsigned long i; struct kvm_vcpu *v; struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid; - if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) + if (!kvm_riscv_gstage_vmid_ver_changed(vmid)) return; spin_lock(&vmid_lock); @@ -78,7 +83,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * We need to re-check the vmid_version here to ensure that if * another vcpu already allocated a valid vmid for this vm. */ - if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) { + if (!kvm_riscv_gstage_vmid_ver_changed(vmid)) { spin_unlock(&vmid_lock); return; } @@ -96,12 +101,13 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * instances is invalid and we have force VMID re-assignement * for all Guest instances. The Guest instances that were not * running will automatically pick-up new VMIDs because will - * call kvm_riscv_stage2_vmid_update() whenever they enter + * call kvm_riscv_gstage_vmid_update() whenever they enter * in-kernel run loop. For Guest instances that are already * running, we force VM exits on all host CPUs using IPI and * flush all Guest TLBs. */ - sbi_remote_hfence_gvma(cpu_online_mask, 0, 0); + on_each_cpu_mask(cpu_online_mask, __local_hfence_gvma_all, + NULL, 1); } vmid->vmid = vmid_next; @@ -112,7 +118,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) spin_unlock(&vmid_lock); - /* Request stage2 page table update for all VCPUs */ + /* Request G-stage page table update for all VCPUs */ kvm_for_each_vcpu(i, v, vcpu->kvm) kvm_make_request(KVM_REQ_UPDATE_HGATP, v); } diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index a2d376b8bce3..cfea7b77a5b8 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -2,7 +2,7 @@ /* * Ultravisor Interfaces * - * Copyright IBM Corp. 2019 + * Copyright IBM Corp. 2019, 2022 * * Author(s): * Vasily Gorbik <gor@linux.ibm.com> @@ -52,6 +52,7 @@ #define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 +#define UVC_CMD_RETR_ATTEST 0x1020 /* Bits in installed uv calls */ enum uv_cmds_inst { @@ -76,6 +77,7 @@ enum uv_cmds_inst { BIT_UVC_CMD_UNSHARE_ALL = 20, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_RETR_ATTEST = 28, }; enum uv_feat_ind { @@ -219,6 +221,25 @@ struct uv_cb_share { u64 reserved28; } __packed __aligned(8); +/* Retrieve Attestation Measurement */ +struct uv_cb_attest { + struct uv_cb_header header; /* 0x0000 */ + u64 reserved08[2]; /* 0x0008 */ + u64 arcb_addr; /* 0x0018 */ + u64 cont_token; /* 0x0020 */ + u8 reserved28[6]; /* 0x0028 */ + u16 user_data_len; /* 0x002e */ + u8 user_data[256]; /* 0x0030 */ + u32 reserved130[3]; /* 0x0130 */ + u32 meas_len; /* 0x013c */ + u64 meas_addr; /* 0x0140 */ + u8 config_uid[16]; /* 0x0148 */ + u32 reserved158; /* 0x0158 */ + u32 add_data_len; /* 0x015c */ + u64 add_data_addr; /* 0x0160 */ + u64 reserved168[4]; /* 0x0168 */ +} __packed __aligned(8); + static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h new file mode 100644 index 000000000000..10a5ac918e02 --- /dev/null +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2022 + * Author(s): Steffen Eiden <seiden@linux.ibm.com> + */ +#ifndef __S390_ASM_UVDEVICE_H +#define __S390_ASM_UVDEVICE_H + +#include <linux/types.h> + +struct uvio_ioctl_cb { + __u32 flags; + __u16 uv_rc; /* UV header rc value */ + __u16 uv_rrc; /* UV header rrc value */ + __u64 argument_addr; /* Userspace address of uvio argument */ + __u32 argument_len; + __u8 reserved14[0x40 - 0x14]; /* must be zero */ +}; + +#define UVIO_ATT_USER_DATA_LEN 0x100 +#define UVIO_ATT_UID_LEN 0x10 +struct uvio_attest { + __u64 arcb_addr; /* 0x0000 */ + __u64 meas_addr; /* 0x0008 */ + __u64 add_data_addr; /* 0x0010 */ + __u8 user_data[UVIO_ATT_USER_DATA_LEN]; /* 0x0018 */ + __u8 config_uid[UVIO_ATT_UID_LEN]; /* 0x0118 */ + __u32 arcb_len; /* 0x0128 */ + __u32 meas_len; /* 0x012c */ + __u32 add_data_len; /* 0x0130 */ + __u16 user_data_len; /* 0x0134 */ + __u16 reserved136; /* 0x0136 */ +}; + +/* + * The following max values define an upper length for the IOCTL in/out buffers. + * However, they do not represent the maximum the Ultravisor allows which is + * often way smaller. By allowing larger buffer sizes we hopefully do not need + * to update the code with every machine update. It is therefore possible for + * userspace to request more memory than actually used by kernel/UV. + */ +#define UVIO_ATT_ARCB_MAX_LEN 0x100000 +#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 +#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 + +#define UVIO_DEVICE_NAME "uv" +#define UVIO_TYPE_UVC 'u' + +#define UVIO_IOCTL_ATT _IOWR(UVIO_TYPE_UVC, 0x01, struct uvio_ioctl_cb) + +#endif /* __S390_ASM_UVDEVICE_H */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index d53a183c2005..227ed0009354 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -491,8 +491,8 @@ enum prot_type { PROT_TYPE_IEP = 4, }; -static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - u8 ar, enum gacc_mode mode, enum prot_type prot) +static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot, bool terminate) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; struct trans_exc_code_bits *tec; @@ -520,6 +520,11 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, tec->b61 = 1; break; } + if (terminate) { + tec->b56 = 0; + tec->b60 = 0; + tec->b61 = 0; + } fallthrough; case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: @@ -552,6 +557,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, return code; } +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot) +{ + return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false); +} + static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, unsigned long ga, u8 ar, enum gacc_mode mode) { @@ -1109,8 +1120,11 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, data += fragment_len; ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len); } - if (rc > 0) - rc = trans_exc(vcpu, rc, ga, ar, mode, prot); + if (rc > 0) { + bool terminate = (mode == GACC_STORE) && (idx > 0); + + rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); + } out_unlock: if (need_ipte_lock) ipte_unlock(vcpu); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 21bb78dfd41d..393f2bbb5e3a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -407,6 +407,7 @@ #define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ /* diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 1a6d7e3f6c32..da47f60a4650 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -127,6 +127,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) +KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h new file mode 100644 index 000000000000..fdfd8e06fee6 --- /dev/null +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL) +BUILD_BUG_ON(1) +#endif + +/* + * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate + * both DECLARE/DEFINE_STATIC_CALL() invocations and + * "static_call_update()" calls. + * + * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have + * a NULL definition, for example if "static_call_cond()" will be used + * at the call sites. + */ +KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(pmc_is_enabled) +KVM_X86_PMU_OP(pmc_idx_to_pmc) +KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) +KVM_X86_PMU_OP(msr_idx_to_pmc) +KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP(is_valid_msr) +KVM_X86_PMU_OP(get_msr) +KVM_X86_PMU_OP(set_msr) +KVM_X86_PMU_OP(refresh) +KVM_X86_PMU_OP(init) +KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) +KVM_X86_PMU_OP_OPTIONAL(cleanup) + +#undef KVM_X86_PMU_OP +#undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4ff36610af6a..959d66b9be94 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -281,11 +281,11 @@ struct kvm_kernel_irq_routing_entry; /* * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in - * the given MMU context. This is a subset of the overall kvm_mmu_role to + * the given MMU context. This is a subset of the overall kvm_cpu_role to * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating * 2 bytes per gfn instead of 4 bytes per gfn. * - * Indirect upper-level shadow pages are tracked for write-protection via + * Upper-level shadow pages having gptes are tracked for write-protection via * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise * gfn_track will overflow and explosions will ensure. @@ -331,7 +331,8 @@ union kvm_mmu_page_role { unsigned smap_andnot_wp:1; unsigned ad_disabled:1; unsigned guest_mode:1; - unsigned :6; + unsigned passthrough:1; + unsigned :5; /* * This is left at the top of the word so that @@ -367,8 +368,6 @@ union kvm_mmu_extended_role { struct { unsigned int valid:1; unsigned int execonly:1; - unsigned int cr0_pg:1; - unsigned int cr4_pae:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; @@ -378,7 +377,7 @@ union kvm_mmu_extended_role { }; }; -union kvm_mmu_role { +union kvm_cpu_role { u64 as_u64; struct { union kvm_mmu_page_role base; @@ -438,19 +437,8 @@ struct kvm_mmu { struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); struct kvm_mmu_root_info root; - union kvm_mmu_role mmu_role; - u8 root_level; - u8 shadow_root_level; - u8 ept_ad; - bool direct_map; - struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; - - /* - * Bitmap; bit set = permission fault - * Byte index: page fault error code [4:1] - * Bit index: pte permissions in ACC_* format - */ - u8 permissions[16]; + union kvm_cpu_role cpu_role; + union kvm_mmu_page_role root_role; /* * The pkru_mask indicates if protection key checks are needed. It @@ -460,6 +448,15 @@ struct kvm_mmu { */ u32 pkru_mask; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; + + /* + * Bitmap; bit set = permission fault + * Byte index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u8 permissions[16]; + u64 *pae_root; u64 *pml4_root; u64 *pml5_root; @@ -607,16 +604,21 @@ struct kvm_vcpu_hv { struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; - bool vcpu_info_set; - bool vcpu_time_info_set; - bool runstate_set; - struct gfn_to_hva_cache vcpu_info_cache; - struct gfn_to_hva_cache vcpu_time_info_cache; - struct gfn_to_hva_cache runstate_cache; + u8 upcall_vector; + struct gfn_to_pfn_cache vcpu_info_cache; + struct gfn_to_pfn_cache vcpu_time_info_cache; + struct gfn_to_pfn_cache runstate_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; unsigned long evtchn_pending_sel; + u32 vcpu_id; /* The Xen / ACPI vCPU ID */ + u32 timer_virq; + u64 timer_expires; /* In guest epoch */ + atomic_t timer_pending; + struct hrtimer timer; + int poll_evtchn; + struct timer_list poll_timer; }; struct kvm_vcpu_arch { @@ -753,8 +755,7 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; - struct gfn_to_hva_cache pv_time; - bool pv_time_enabled; + struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; @@ -1024,9 +1025,12 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + u32 xen_version; bool long_mode; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; + struct idr evtchn_ports; + unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; enum kvm_irqchip_mode { @@ -1119,6 +1123,8 @@ struct kvm_arch { u64 cur_tsc_generation; int nr_vcpus_matched_tsc; + u32 default_tsc_khz; + seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; @@ -1263,7 +1269,12 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; + u64 pf_taken; u64 pf_fixed; + u64 pf_emulate; + u64 pf_spurious; + u64 pf_fast; + u64 pf_mmio_spte_created; u64 pf_guest; u64 tlb_flush; u64 invlpg; @@ -1455,8 +1466,6 @@ struct kvm_x86_ops { int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); - /* pmu operations of sub-arch */ - const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; void (*vcpu_blocking)(struct kvm_vcpu *vcpu); @@ -1499,11 +1508,18 @@ struct kvm_x86_ops { int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); + + /* + * Returns vCPU specific APICv inhibit reasons + */ + unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { void (*leave_nested)(struct kvm_vcpu *vcpu); int (*check_events)(struct kvm_vcpu *vcpu); + bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu, + struct x86_exception *fault); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, @@ -1528,6 +1544,7 @@ struct kvm_x86_init_ops { unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; + struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { @@ -1549,20 +1566,6 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> -static inline void kvm_ops_static_call_update(void) -{ -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include <asm/kvm-x86-ops.h> -#undef __KVM_X86_OP -} - #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -1800,6 +1803,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); @@ -1989,6 +1993,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ - KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) + KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ + KVM_X86_QUIRK_FIX_HYPERCALL_INSN) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index f78e2b3501a1..35f222aa66bf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -382,6 +382,103 @@ do { \ #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT +#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ + bool success; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm_volatile_goto("\n" \ + "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ + _ASM_EXTABLE_UA(1b, %l[label]) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*_ptr), \ + [old] "+a" (__old) \ + : [new] ltype (__new) \ + : "memory" \ + : label); \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); }) + +#ifdef CONFIG_X86_32 +#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ + bool success; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm_volatile_goto("\n" \ + "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ + _ASM_EXTABLE_UA(1b, %l[label]) \ + : CC_OUT(z) (success), \ + "+A" (__old), \ + [ptr] "+m" (*_ptr) \ + : "b" ((u32)__new), \ + "c" ((u32)((u64)__new >> 32)) \ + : "memory" \ + : label); \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); }) +#endif // CONFIG_X86_32 +#else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT +#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ + int __err = 0; \ + bool success; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm volatile("\n" \ + "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ + CC_SET(z) \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ + %[errout]) \ + : CC_OUT(z) (success), \ + [errout] "+r" (__err), \ + [ptr] "+m" (*_ptr), \ + [old] "+a" (__old) \ + : [new] ltype (__new) \ + : "memory", "cc"); \ + if (unlikely(__err)) \ + goto label; \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); }) + +#ifdef CONFIG_X86_32 +/* + * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error. + * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are + * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses + * both ESI and EDI for the memory operand, compilation will fail if the error + * is an input+output as there will be no register available for input. + */ +#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ + int __result; \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + asm volatile("\n" \ + "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ + "mov $0, %%ecx\n\t" \ + "setz %%cl\n" \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \ + : [result]"=c" (__result), \ + "+A" (__old), \ + [ptr] "+m" (*_ptr) \ + : "b" ((u32)__new), \ + "c" ((u32)((u64)__new >> 32)) \ + : "memory", "cc"); \ + if (unlikely(__result < 0)) \ + goto label; \ + if (unlikely(!__result)) \ + *_old = __old; \ + likely(__result); }) +#endif // CONFIG_X86_32 +#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT + /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct __user *)(x)) @@ -474,6 +571,51 @@ do { \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT +extern void __try_cmpxchg_user_wrong_size(void); + +#ifndef CONFIG_X86_32 +#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \ + __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label) +#endif + +/* + * Force the pointer to u<size> to match the size expected by the asm helper. + * clang/LLVM compiles all cases and only discards the unused paths after + * processing errors, which breaks i386 if the pointer is an 8-byte value. + */ +#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ + bool __ret; \ + __chk_user_ptr(_ptr); \ + switch (sizeof(*(_ptr))) { \ + case 1: __ret = __try_cmpxchg_user_asm("b", "q", \ + (__force u8 *)(_ptr), (_oldp), \ + (_nval), _label); \ + break; \ + case 2: __ret = __try_cmpxchg_user_asm("w", "r", \ + (__force u16 *)(_ptr), (_oldp), \ + (_nval), _label); \ + break; \ + case 4: __ret = __try_cmpxchg_user_asm("l", "r", \ + (__force u32 *)(_ptr), (_oldp), \ + (_nval), _label); \ + break; \ + case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\ + (_nval), _label); \ + break; \ + default: __try_cmpxchg_user_wrong_size(); \ + } \ + __ret; }) + +/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */ +#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ + int __ret = -EFAULT; \ + __uaccess_begin_nospec(); \ + __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \ +_label: \ + __uaccess_end(); \ + __ret; \ + }) + /* * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0ffaa3156a4e..6c343c6a1855 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -543,16 +543,14 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_ACC_READ_BIT 0 #define EPT_VIOLATION_ACC_WRITE_BIT 1 #define EPT_VIOLATION_ACC_INSTR_BIT 2 -#define EPT_VIOLATION_READABLE_BIT 3 -#define EPT_VIOLATION_WRITABLE_BIT 4 -#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_RWX_SHIFT 3 +#define EPT_VIOLATION_GVA_IS_VALID_BIT 7 #define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) #define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) -#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) -#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) -#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) +#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) +#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index bf6e96011dfe..21614807a2cb 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -428,11 +428,12 @@ struct kvm_sync_regs { struct kvm_vcpu_events events; }; -#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) -#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) -#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) -#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) +#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b14533af7676..9b698215d261 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,7 +5,7 @@ #include <asm/ia32.h> -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) #include <asm/kvm_para.h> #endif @@ -20,7 +20,7 @@ int main(void) BLANK(); #endif -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); BLANK(); #endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e28ab0ecc537..0fdc807ae13f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -14,6 +14,8 @@ #include <asm/traps.h> #include <asm/irq_regs.h> +#include <uapi/asm/kvm.h> + #include <linux/hardirq.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> @@ -232,7 +234,20 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; - gfpu->uabi_size = fpu_user_cfg.default_size; + + /* + * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state + * to userspace, even when XSAVE is unsupported, so that restoring FPU + * state on a different CPU that does support XSAVE can cleanly load + * the incoming state using its natural XSAVE. In other words, KVM's + * uABI size may be larger than this host's default size. Conversely, + * the default size should never be larger than KVM's base uABI size; + * all features that can expand the uABI size must be opt-in. + */ + gfpu->uabi_size = sizeof(struct kvm_xsave); + if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) + gfpu->uabi_size = fpu_user_cfg.default_size; + fpu_init_guest_permissions(gfpu); return true; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8b1c45c9cda8..1a3658f7e6d9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -191,7 +191,7 @@ void kvm_async_pf_task_wake(u32 token) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node *n; + struct kvm_task_sleep_node *n, *dummy = NULL; if (token == ~0) { apf_task_wake_all(); @@ -203,28 +203,41 @@ again: n = _find_apf_task(b, token); if (!n) { /* - * async PF was not yet handled. - * Add dummy entry for the token. + * Async #PF not yet handled, add a dummy entry for the token. + * Allocating the token must be down outside of the raw lock + * as the allocator is preemptible on PREEMPT_RT kernels. */ - n = kzalloc(sizeof(*n), GFP_ATOMIC); - if (!n) { + if (!dummy) { + raw_spin_unlock(&b->lock); + dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC); + /* - * Allocation failed! Busy wait while other cpu - * handles async PF. + * Continue looping on allocation failure, eventually + * the async #PF will be handled and allocating a new + * node will be unnecessary. + */ + if (!dummy) + cpu_relax(); + + /* + * Recheck for async #PF completion before enqueueing + * the dummy token to avoid duplicate list entries. */ - raw_spin_unlock(&b->lock); - cpu_relax(); goto again; } - n->token = token; - n->cpu = smp_processor_id(); - init_swait_queue_head(&n->wq); - hlist_add_head(&n->link, &b->list); + dummy->token = token; + dummy->cpu = smp_processor_id(); + init_swait_queue_head(&dummy->wq); + hlist_add_head(&dummy->link, &b->list); + dummy = NULL; } else { apf_task_wake_one(n); } raw_spin_unlock(&b->lock); - return; + + /* A dummy token might be allocated and ultimately not used. */ + if (dummy) + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -765,6 +778,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs) } #endif +#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) +bool __kvm_vcpu_is_preempted(long cpu); + +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!(src->preempted & KVM_VCPU_PREEMPTED); +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include <asm/asm-offsets.h> + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +ASM_ENDBR +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +ASM_RET +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" +".popsection"); + +#endif + static void __init kvm_guest_init(void) { int i; @@ -777,6 +826,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; static_call_update(pv_steal_clock, kvm_steal_clock); + + pv_ops.lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -1018,40 +1070,6 @@ static void kvm_wait(u8 *ptr, u8 val) } } -#ifdef CONFIG_X86_32 -__visible bool __kvm_vcpu_is_preempted(long cpu) -{ - struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - - return !!(src->preempted & KVM_VCPU_PREEMPTED); -} -PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); - -#else - -#include <asm/asm-offsets.h> - -extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); - -/* - * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and - * restoring to/from the stack. - */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -"__raw_callee_save___kvm_vcpu_is_preempted:" -ASM_ENDBR -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -ASM_RET -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); - -#endif - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -1095,10 +1113,6 @@ void __init kvm_spinlock_init(void) pv_ops.lock.wait = kvm_wait; pv_ops.lock.kick = kvm_kick_cpu; - if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_ops.lock.vcpu_is_preempted = - PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); - } /* * When PV spinlock is enabled which is preferred over * virt_spin_lock(), virt_spin_lock_key's value is meaningless. diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index c5caa7311bd8..16333ba1904b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { - if (!kvm_para_available() || !kvmclock) + if (!kvm_para_available() || !kvmclock || nopv) return 0; kvmclock_init_mem(); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index be99dc86293d..e1bb6218bb96 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm) */ irq2 = 7; intno = s->pics[1].irq_base + irq2; - irq = irq2 + 8; } else intno = s->pics[0].irq_base + irq; } else { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 172b05343cfd..f371f1292ca3 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -22,10 +22,14 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { + int r = 0; + if (lapic_in_kernel(vcpu)) - return apic_has_pending_timer(vcpu); + r = apic_has_pending_timer(vcpu); + if (kvm_xen_timer_enabled(vcpu)) + r += kvm_xen_has_pending_timer(vcpu); - return 0; + return r; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) kvm_inject_apic_timer_irqs(vcpu); + if (kvm_xen_timer_enabled(vcpu)) + kvm_xen_inject_timer_irqs(vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6e0dab04320e..0687162c4f22 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (!level) return -1; - return kvm_xen_set_evtchn_fast(e, kvm); + return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); #endif default: break; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 66b0eb0bda94..f1bdac3f5aa8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1548,6 +1548,7 @@ static void cancel_apic_timer(struct kvm_lapic *apic) if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); preempt_enable(); + atomic_set(&apic->lapic_timer.pending, 0); } static void apic_update_lvtt(struct kvm_lapic *apic) @@ -1648,10 +1649,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); if (lapic_timer_advance_dynamic) { - adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); + adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); /* * If the timer fired early, reread the TSC to account for the * overhead of the above adjustment to avoid waiting longer diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4e4f8a22754f..65bb2a8cf145 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -38,7 +38,6 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; u32 timer_advance_ns; - s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; }; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index a335e7f1f69e..f8192864b496 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -89,7 +89,27 @@ static inline gfn_t kvm_mmu_max_gfn(void) return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1; } +static inline u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void kvm_init_mmu(struct kvm_vcpu *vcpu); @@ -138,94 +158,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) return; static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa, - vcpu->arch.mmu->shadow_root_level); -} - -struct kvm_page_fault { - /* arguments to kvm_mmu_do_page_fault. */ - const gpa_t addr; - const u32 error_code; - const bool prefetch; - - /* Derived from error_code. */ - const bool exec; - const bool write; - const bool present; - const bool rsvd; - const bool user; - - /* Derived from mmu and global state. */ - const bool is_tdp; - const bool nx_huge_page_workaround_enabled; - - /* - * Whether a >4KB mapping can be created or is forbidden due to NX - * hugepages. - */ - bool huge_page_disallowed; - - /* - * Maximum page size that can be created for this fault; input to - * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. - */ - u8 max_level; - - /* - * Page size that can be created based on the max_level and the - * page size used by the host mapping. - */ - u8 req_level; - - /* - * Page size that will be created based on the req_level and - * huge_page_disallowed. - */ - u8 goal_level; - - /* Shifted addr, or result of guest page table walk if addr is a gva. */ - gfn_t gfn; - - /* The memslot containing gfn. May be NULL. */ - struct kvm_memory_slot *slot; - - /* Outputs of kvm_faultin_pfn. */ - kvm_pfn_t pfn; - hva_t hva; - bool map_writable; -}; - -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); - -extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - -static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch) -{ - struct kvm_page_fault fault = { - .addr = cr2_or_gpa, - .error_code = err, - .exec = err & PFERR_FETCH_MASK, - .write = err & PFERR_WRITE_MASK, - .present = err & PFERR_PRESENT_MASK, - .rsvd = err & PFERR_RSVD_MASK, - .user = err & PFERR_USER_MASK, - .prefetch = prefetch, - .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), - - .max_level = KVM_MAX_HUGEPAGE_LEVEL, - .req_level = PG_LEVEL_4K, - .goal_level = PG_LEVEL_4K, - }; -#ifdef CONFIG_RETPOLINE - if (fault.is_tdp) - return kvm_tdp_page_fault(vcpu, &fault); -#endif - return vcpu->arch.mmu->page_fault(vcpu, &fault); + vcpu->arch.mmu->root_role.level); } /* diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 45e1573f8f1d..f4653688fa6d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -193,11 +193,12 @@ struct kvm_mmu_role_regs { /* * Yes, lot's of underscores. They're a hint that you probably shouldn't be - * reading from the role_regs. Once the mmu_role is constructed, it becomes + * reading from the role_regs. Once the root_role is constructed, it becomes * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ -static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +static inline bool __maybe_unused \ +____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \ { \ return !!(regs->reg & flag); \ } @@ -221,17 +222,26 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ - return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ + return !!(mmu->cpu_role. base_or_ext . reg##_##name); \ } -BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg); BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); -BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); +BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma); + +static inline bool is_cr0_pg(struct kvm_mmu *mmu) +{ + return mmu->cpu_role.base.level > 0; +} + +static inline bool is_cr4_pae(struct kvm_mmu *mmu) +{ + return !mmu->cpu_role.base.has_4_byte_gpte; +} static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) { @@ -244,19 +254,6 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) return regs; } -static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs) -{ - if (!____is_cr0_pg(regs)) - return 0; - else if (____is_efer_lma(regs)) - return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : - PT64_ROOT_4LEVEL; - else if (____is_cr4_pae(regs)) - return PT32E_ROOT_LEVEL; - else - return PT32_ROOT_LEVEL; -} - static inline bool kvm_available_flush_tlb_with_range(void) { return kvm_x86_ops.tlb_remote_flush_with_range; @@ -714,6 +711,9 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { + if (sp->role.passthrough) + return sp->gfn; + if (!sp->role.direct) return sp->gfns[index]; @@ -722,6 +722,11 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { + if (sp->role.passthrough) { + WARN_ON_ONCE(gfn != sp->gfn); + return; + } + if (!sp->role.direct) { sp->gfns[index] = gfn; return; @@ -1478,9 +1483,11 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { - if (++iterator->rmap <= iterator->end_rmap) { + while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); - return; + + if (iterator->rmap->val) + return; } if (++iterator->level > iterator->end_level) { @@ -1833,27 +1840,35 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); +static bool sp_has_gptes(struct kvm_mmu_page *sp) +{ + if (sp->role.direct) + return false; + + if (sp->role.passthrough) + return false; + + return true; +} + #define for_each_valid_sp(_kvm, _sp, _list) \ hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else -#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ for_each_valid_sp(_kvm, _sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ - if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else + if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else -static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int ret = vcpu->arch.mmu->sync_page(vcpu, sp); - if (ret < 0) { + if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - return !!ret; + return ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, @@ -1975,7 +1990,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, for_each_sp(pages, sp, parents, i) { kvm_unlink_unsync_page(vcpu->kvm, sp); - flush |= kvm_sync_page(vcpu, sp, &invalid_list); + flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0; mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { @@ -2011,15 +2026,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { - bool direct_mmu = vcpu->arch.mmu->direct_map; + bool direct_mmu = vcpu->arch.mmu->root_role.direct; union kvm_mmu_page_role role; struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; + int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->mmu_role.base; + role = vcpu->arch.mmu->root_role; role.level = level; role.direct = direct; role.access = access; @@ -2028,6 +2044,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } + if (level <= vcpu->arch.mmu->cpu_role.base.level) + role.passthrough = 0; sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; for_each_valid_sp(vcpu->kvm, sp, sp_list) { @@ -2068,11 +2086,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * If the sync fails, the page is zapped. If so, break * in order to rebuild it. */ - if (!kvm_sync_page(vcpu, sp, &invalid_list)) + ret = kvm_sync_page(vcpu, sp, &invalid_list); + if (ret < 0) break; WARN_ON(!list_empty(&invalid_list)); - kvm_flush_remote_tlbs(vcpu->kvm); + if (ret > 0) + kvm_flush_remote_tlbs(vcpu->kvm); } __clear_sp_write_flooding_count(sp); @@ -2089,7 +2109,7 @@ trace_get_page: sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (!direct) { + if (sp_has_gptes(sp)) { account_shadowed(vcpu->kvm, sp); if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); @@ -2109,11 +2129,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato { iterator->addr = addr; iterator->shadow_addr = root; - iterator->level = vcpu->arch.mmu->shadow_root_level; + iterator->level = vcpu->arch.mmu->root_role.level; if (iterator->level >= PT64_ROOT_4LEVEL && - vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && - !vcpu->arch.mmu->direct_map) + vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL && + !vcpu->arch.mmu->root_role.direct) iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { @@ -2298,7 +2318,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; - if (!sp->role.invalid && !sp->role.direct) + if (!sp->role.invalid && sp_has_gptes(sp)) unaccount_shadowed(kvm, sp); if (sp->unsync) @@ -2478,7 +2498,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; write_lock(&kvm->mmu_lock); - for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -2495,7 +2515,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (vcpu->arch.mmu->direct_map) + if (vcpu->arch.mmu->root_role.direct) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); @@ -2540,7 +2560,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * that case, KVM must complete emulation of the guest TLB flush before * allowing shadow pages to become unsync (writable by the guest). */ - for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { if (!can_unsync) return -EPERM; @@ -2642,6 +2662,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, *sptep, write_fault, gfn); if (unlikely(is_noslot_pfn(pfn))) { + vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); return RET_PF_EMULATE; } @@ -2962,7 +2983,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; direct_pte_prefetch(vcpu, it.sptep); - ++vcpu->stat.pf_fixed; return ret; } @@ -2989,14 +3009,12 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access, int *ret_val) +static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access) { /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(fault->pfn))) { - *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); - return true; - } + if (unlikely(is_error_pfn(fault->pfn))) + return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -3013,44 +3031,48 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa * and only if L1's MAXPHYADDR is inaccurate with respect to * the hardware's). */ - if (unlikely(!shadow_mmio_value) || - unlikely(fault->gfn > kvm_mmu_max_gfn())) { - *ret_val = RET_PF_EMULATE; - return true; - } + if (unlikely(!enable_mmio_caching) || + unlikely(fault->gfn > kvm_mmu_max_gfn())) + return RET_PF_EMULATE; } - return false; + return RET_PF_CONTINUE; } static bool page_fault_can_be_fast(struct kvm_page_fault *fault) { /* - * Do not fix the mmio spte with invalid generation number which - * need to be updated by slow page fault path. + * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only + * reach the common page fault handler if the SPTE has an invalid MMIO + * generation number. Refreshing the MMIO generation needs to go down + * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag! */ if (fault->rsvd) return false; - /* See if the page fault is due to an NX violation */ - if (unlikely(fault->exec && fault->present)) - return false; - /* * #PF can be fast if: - * 1. The shadow page table entry is not present, which could mean that - * the fault is potentially caused by access tracking (if enabled). - * 2. The shadow page table entry is present and the fault - * is caused by write-protect, that means we just need change the W - * bit of the spte which can be done out of mmu-lock. * - * However, if access tracking is disabled we know that a non-present - * page must be a genuine page fault where we have to create a new SPTE. - * So, if access tracking is disabled, we return true only for write - * accesses to a present page. + * 1. The shadow page table entry is not present and A/D bits are + * disabled _by KVM_, which could mean that the fault is potentially + * caused by access tracking (if enabled). If A/D bits are enabled + * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D + * bits for L2 and employ access tracking, but the fast page fault + * mechanism only supports direct MMUs. + * 2. The shadow page table entry is present, the access is a write, + * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e. + * the fault was caused by a write-protection violation. If the + * SPTE is MMU-writable (determined later), the fault can be fixed + * by setting the Writable bit, which can be done out of mmu_lock. */ + if (!fault->present) + return !kvm_ad_enabled(); - return shadow_acc_track_mask != 0 || (fault->write && fault->present); + /* + * Note, instruction fetches and writes are mutually exclusive, ignore + * the "exec" flag. + */ + return fault->write; } /* @@ -3165,13 +3187,25 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) new_spte = spte; - if (is_access_track_spte(spte)) + /* + * KVM only supports fixing page faults outside of MMU lock for + * direct MMUs, nested MMUs are always indirect, and KVM always + * uses A/D bits for non-nested MMUs. Thus, if A/D bits are + * enabled, the SPTE can't be an access-tracked SPTE. + */ + if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) new_spte = restore_acc_track_spte(new_spte); /* - * Currently, to simplify the code, write-protection can - * be removed in the fast path only if the SPTE was - * write-protected for dirty-logging or access tracking. + * To keep things simple, only SPTEs that are MMU-writable can + * be made fully writable outside of mmu_lock, e.g. only SPTEs + * that were write-protected for dirty-logging or access + * tracking are handled here. Don't bother checking if the + * SPTE is writable to prioritize running with A/D bits enabled. + * The is_access_allowed() check above handles the common case + * of the fault being spurious, and the SPTE is known to be + * shadow-present, i.e. except for access tracking restoration + * making the new SPTE writable, the check is wasteful. */ if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; @@ -3217,6 +3251,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); + if (ret != RET_PF_INVALID) + vcpu->stat.pf_fast++; + return ret; } @@ -3303,7 +3340,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) * This should not be called while L2 is active, L2 can't invalidate * _only_ its own roots, e.g. INVVPID unconditionally exits. */ - WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); + WARN_ON_ONCE(mmu->root_role.guest_mode); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { root_hpa = mmu->prev_roots[i].hpa; @@ -3346,7 +3383,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u8 shadow_root_level = mmu->shadow_root_level; + u8 shadow_root_level = mmu->root_role.level; hpa_t root; unsigned i; int r; @@ -3470,7 +3507,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * On SVM, reading PDPTRs might access guest memory, which might fault * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. */ - if (mmu->root_level == PT32E_ROOT_LEVEL) { + if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) @@ -3494,9 +3531,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (mmu->root_level >= PT64_ROOT_4LEVEL) { + if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->shadow_root_level, false); + mmu->root_role.level, false); mmu->root.hpa = root; goto set_root_pgd; } @@ -3511,8 +3548,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ - pm_mask = PT_PRESENT_MASK | shadow_me_mask; - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + pm_mask = PT_PRESENT_MASK | shadow_me_value; + if (mmu->root_role.level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { @@ -3521,7 +3558,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; - if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (mmu->root_role.level == PT64_ROOT_5LEVEL) { if (WARN_ON_ONCE(!mmu->pml5_root)) { r = -EIO; goto out_unlock; @@ -3533,7 +3570,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - if (mmu->root_level == PT32E_ROOT_LEVEL) { + if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { if (!(pdptrs[i] & PT_PRESENT_MASK)) { mmu->pae_root[i] = INVALID_PAE_ROOT; continue; @@ -3546,9 +3583,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } - if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + if (mmu->root_role.level == PT64_ROOT_5LEVEL) mmu->root.hpa = __pa(mmu->pml5_root); - else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + else if (mmu->root_role.level == PT64_ROOT_4LEVEL) mmu->root.hpa = __pa(mmu->pml4_root); else mmu->root.hpa = __pa(mmu->pae_root); @@ -3564,7 +3601,7 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; + bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL; u64 *pml5_root = NULL; u64 *pml4_root = NULL; u64 *pae_root; @@ -3575,8 +3612,9 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) * equivalent level in the guest's NPT to shadow. Allocate the tables * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. */ - if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || - mmu->shadow_root_level < PT64_ROOT_4LEVEL) + if (mmu->root_role.direct || + mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL || + mmu->root_role.level < PT64_ROOT_4LEVEL) return 0; /* @@ -3672,7 +3710,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *sp; - if (vcpu->arch.mmu->direct_map) + if (vcpu->arch.mmu->root_role.direct) return; if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) @@ -3680,7 +3718,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; sp = to_shadow_page(root); @@ -3902,14 +3940,33 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, arch.token = alloc_apf_token(vcpu); arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu->direct_map; + arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); return kvm_setup_async_pf(vcpu, cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) +{ + int r; + + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || + work->wakeup_all) + return; + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return; + + if (!vcpu->arch.mmu->root_role.direct && + work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) + return; + + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); +} + +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; @@ -3920,7 +3977,7 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - goto out_retry; + return RET_PF_RETRY; if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. */ @@ -3928,7 +3985,7 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; - return false; + return RET_PF_CONTINUE; } /* * If the APIC access page exists but is disabled, go directly @@ -3937,10 +3994,8 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * when the AVIC is re-enabled. */ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && - !kvm_apicv_activated(vcpu->kvm)) { - *r = RET_PF_EMULATE; - return true; - } + !kvm_apicv_activated(vcpu->kvm)) + return RET_PF_EMULATE; } async = false; @@ -3948,26 +4003,23 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, fault->write, &fault->map_writable, &fault->hva); if (!async) - return false; /* *pfn has correct page already */ + return RET_PF_CONTINUE; /* *pfn has correct page already */ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); - goto out_retry; - } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) - goto out_retry; + return RET_PF_RETRY; + } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { + return RET_PF_RETRY; + } } fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, fault->write, &fault->map_writable, &fault->hva); - return false; - -out_retry: - *r = RET_PF_RETRY; - return true; + return RET_PF_CONTINUE; } /* @@ -4022,10 +4074,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, fault, &r)) + r = kvm_faultin_pfn(vcpu, fault); + if (r != RET_PF_CONTINUE) return r; - if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) + r = handle_abnormal_pfn(vcpu, fault, ACC_ALL); + if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; @@ -4120,7 +4174,6 @@ static void nonpaging_init_context(struct kvm_mmu *context) context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->direct_map = true; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, @@ -4214,7 +4267,7 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { struct kvm_mmu *mmu = vcpu->arch.mmu; - union kvm_mmu_page_role new_role = mmu->mmu_role.base; + union kvm_mmu_page_role new_role = mmu->root_role; if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) { /* kvm_mmu_ensure_valid_pgd will set up a new root. */ @@ -4391,12 +4444,12 @@ static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); } -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, - context->root_level, is_efer_nx(context), + context->cpu_role.base.level, is_efer_nx(context), guest_can_use_gbpages(vcpu), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); @@ -4461,16 +4514,6 @@ static inline u64 reserved_hpa_bits(void) static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - /* - * KVM uses NX when TDP is disabled to handle a variety of scenarios, - * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and - * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. - * The iTLB multi-hit workaround can be toggled at any time, so assume - * NX can be used by any non-nested shadow MMU to avoid having to reset - * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. - */ - bool uses_nx = is_efer_nx(context) || !tdp_enabled; - /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ bool is_amd = true; /* KVM doesn't use 2-level page tables for the shadow MMU. */ @@ -4478,19 +4521,28 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct rsvd_bits_validate *shadow_zero_check; int i; - WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); + WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL); shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->shadow_root_level, uses_nx, + context->root_role.level, + context->root_role.efer_nx, guest_can_use_gbpages(vcpu), is_pse, is_amd); if (!shadow_me_mask) return; - for (i = context->shadow_root_level; --i >= 0;) { - shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; - shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + for (i = context->root_role.level; --i >= 0;) { + /* + * So far shadow_me_value is a constant during KVM's life + * time. Bits in shadow_me_value are allowed to be set. + * Bits in shadow_me_mask but not in shadow_me_value are + * not allowed to be set. + */ + shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value; } } @@ -4515,7 +4567,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->shadow_root_level, false, + context->root_role.level, false, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else @@ -4526,7 +4578,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (!shadow_me_mask) return; - for (i = context->shadow_root_level; --i >= 0;) { + for (i = context->root_role.level; --i >= 0;) { shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; } @@ -4700,7 +4752,7 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, if (!is_cr0_pg(mmu)) return; - reset_rsvds_bits_mask(vcpu, mmu); + reset_guest_rsvds_bits_mask(vcpu, mmu); update_permission_bitmask(mmu, false); update_pkru_bitmask(mmu); } @@ -4711,7 +4763,6 @@ static void paging64_init_context(struct kvm_mmu *context) context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; - context->direct_map = false; } static void paging32_init_context(struct kvm_mmu *context) @@ -4720,51 +4771,45 @@ static void paging32_init_context(struct kvm_mmu *context) context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; - context->direct_map = false; -} - -static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs) -{ - union kvm_mmu_extended_role ext = {0}; - - if (____is_cr0_pg(regs)) { - ext.cr0_pg = 1; - ext.cr4_pae = ____is_cr4_pae(regs); - ext.cr4_smep = ____is_cr4_smep(regs); - ext.cr4_smap = ____is_cr4_smap(regs); - ext.cr4_pse = ____is_cr4_pse(regs); - - /* PKEY and LA57 are active iff long mode is active. */ - ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); - ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); - ext.efer_lma = ____is_efer_lma(regs); - } - - ext.valid = 1; - - return ext; } -static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs, - bool base_only) +static union kvm_cpu_role +kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) { - union kvm_mmu_role role = {0}; + union kvm_cpu_role role = {0}; role.base.access = ACC_ALL; - if (____is_cr0_pg(regs)) { - role.base.efer_nx = ____is_efer_nx(regs); - role.base.cr0_wp = ____is_cr0_wp(regs); - } role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); + role.ext.valid = 1; - if (base_only) + if (!____is_cr0_pg(regs)) { + role.base.direct = 1; return role; + } - role.ext = kvm_calc_mmu_role_ext(vcpu, regs); + role.base.efer_nx = ____is_efer_nx(regs); + role.base.cr0_wp = ____is_cr0_wp(regs); + role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs); + role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs); + role.base.has_4_byte_gpte = !____is_cr4_pae(regs); + if (____is_efer_lma(regs)) + role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL + : PT64_ROOT_4LEVEL; + else if (____is_cr4_pae(regs)) + role.base.level = PT32E_ROOT_LEVEL; + else + role.base.level = PT32_ROOT_LEVEL; + + role.ext.cr4_smep = ____is_cr4_smep(regs); + role.ext.cr4_smap = ____is_cr4_smap(regs); + role.ext.cr4_pse = ____is_cr4_pse(regs); + + /* PKEY and LA57 are active iff long mode is active. */ + role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); + role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + role.ext.efer_lma = ____is_efer_lma(regs); return role; } @@ -4781,40 +4826,43 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) return max_tdp_level; } -static union kvm_mmu_role +static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs, bool base_only) + union kvm_cpu_role cpu_role) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); + union kvm_mmu_page_role role = {0}; - role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = kvm_mmu_get_tdp_level(vcpu); - role.base.direct = true; - role.base.has_4_byte_gpte = false; + role.access = ACC_ALL; + role.cr0_wp = true; + role.efer_nx = true; + role.smm = cpu_role.base.smm; + role.guest_mode = cpu_role.base.guest_mode; + role.ad_disabled = !kvm_ad_enabled(); + role.level = kvm_mmu_get_tdp_level(vcpu); + role.direct = true; + role.has_4_byte_gpte = false; return role; } -static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, + union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; - struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - union kvm_mmu_role new_role = - kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false); + union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role); - if (new_role.as_u64 == context->mmu_role.as_u64) + if (cpu_role.as_u64 == context->cpu_role.as_u64 && + root_role.word == context->root_role.word) return; - context->mmu_role.as_u64 = new_role.as_u64; + context->cpu_role.as_u64 = cpu_role.as_u64; + context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); - context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; - context->root_level = role_regs_to_root_level(®s); if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -4827,46 +4875,16 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(context); } -static union kvm_mmu_role -kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs, bool base_only) -{ - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); - - role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); - role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); - role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); - - return role; -} - -static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs, bool base_only) -{ - union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); - - role.base.direct = !____is_cr0_pg(regs); - - if (!____is_efer_lma(regs)) - role.base.level = PT32E_ROOT_LEVEL; - else if (____is_cr4_la57(regs)) - role.base.level = PT64_ROOT_5LEVEL; - else - role.base.level = PT64_ROOT_4LEVEL; - - return role; -} - static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - struct kvm_mmu_role_regs *regs, - union kvm_mmu_role new_role) + union kvm_cpu_role cpu_role, + union kvm_mmu_page_role root_role) { - if (new_role.as_u64 == context->mmu_role.as_u64) + if (cpu_role.as_u64 == context->cpu_role.as_u64 && + root_role.word == context->root_role.word) return; - context->mmu_role.as_u64 = new_role.as_u64; + context->cpu_role.as_u64 = cpu_role.as_u64; + context->root_role.word = root_role.word; if (!is_cr0_pg(context)) nonpaging_init_context(context); @@ -4874,35 +4892,34 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte paging64_init_context(context); else paging32_init_context(context); - context->root_level = role_regs_to_root_level(regs); reset_guest_paging_metadata(vcpu, context); - context->shadow_root_level = new_role.base.level; - reset_shadow_zero_bits_mask(vcpu, context); } static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs) + union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; - union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); + union kvm_mmu_page_role root_role; - shadow_mmu_init_context(vcpu, context, regs, new_role); -} + root_role = cpu_role.base; -static union kvm_mmu_role -kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, - struct kvm_mmu_role_regs *regs) -{ - union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, regs, false); + /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */ + root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL); - role.base.direct = false; - role.base.level = kvm_mmu_get_tdp_level(vcpu); + /* + * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role. + * KVM uses NX when TDP is disabled to handle a variety of scenarios, + * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and + * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. + * The iTLB multi-hit workaround can be toggled at any time, so assume + * NX can be used by any non-nested shadow MMU to avoid having to reset + * MMU contexts. + */ + root_role.efer_nx = true; - return role; + shadow_mmu_init_context(vcpu, context, cpu_role, root_role); } void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, @@ -4914,24 +4931,34 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; - union kvm_mmu_role new_role; + union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, ®s); + union kvm_mmu_page_role root_role; + + /* NPT requires CR0.PG=1. */ + WARN_ON_ONCE(cpu_role.base.direct); - new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); + root_role = cpu_role.base; + root_role.level = kvm_mmu_get_tdp_level(vcpu); + if (root_role.level == PT64_ROOT_5LEVEL && + cpu_role.base.level == PT64_ROOT_4LEVEL) + root_role.passthrough = 1; - shadow_mmu_init_context(vcpu, context, ®s, new_role); + shadow_mmu_init_context(vcpu, context, cpu_role, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); -static union kvm_mmu_role +static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, bool execonly, u8 level) { - union kvm_mmu_role role = {0}; - - /* SMM flag is inherited from root_mmu */ - role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; + union kvm_cpu_role role = {0}; + /* + * KVM does not support SMM transfer monitors, and consequently does not + * support the "entry to SMM" control either. role.base.smm is always 0. + */ + WARN_ON_ONCE(is_smm(vcpu)); role.base.level = level; role.base.has_4_byte_gpte = false; role.base.direct = false; @@ -4939,7 +4966,6 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.guest_mode = true; role.base.access = ACC_ALL; - /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ role.ext.word = 0; role.ext.execonly = execonly; role.ext.valid = 1; @@ -4953,22 +4979,20 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); - union kvm_mmu_role new_role = + union kvm_cpu_role new_mode = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - if (new_role.as_u64 != context->mmu_role.as_u64) { - context->mmu_role.as_u64 = new_role.as_u64; + if (new_mode.as_u64 != context->cpu_role.as_u64) { + /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ + context->cpu_role.as_u64 = new_mode.as_u64; + context->root_role.word = new_mode.base.word; - context->shadow_root_level = level; - - context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; - context->root_level = level; - context->direct_map = false; + update_permission_bitmask(context, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); @@ -4979,49 +5003,30 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); -static void init_kvm_softmmu(struct kvm_vcpu *vcpu) +static void init_kvm_softmmu(struct kvm_vcpu *vcpu, + union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; - struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - kvm_init_shadow_mmu(vcpu, ®s); + kvm_init_shadow_mmu(vcpu, cpu_role); context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } -static union kvm_mmu_role -kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) +static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, + union kvm_cpu_role new_mode) { - union kvm_mmu_role role; - - role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); - - /* - * Nested MMUs are used only for walking L2's gva->gpa, they never have - * shadow pages of their own and so "direct" has no meaning. Set it - * to "true" to try to detect bogus usage of the nested MMU. - */ - role.base.direct = true; - role.base.level = role_regs_to_root_level(regs); - return role; -} - -static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; - if (new_role.as_u64 == g_context->mmu_role.as_u64) + if (new_mode.as_u64 == g_context->cpu_role.as_u64) return; - g_context->mmu_role.as_u64 = new_role.as_u64; + g_context->cpu_role.as_u64 = new_mode.as_u64; g_context->get_guest_pgd = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; - g_context->root_level = new_role.base.level; /* * L2 page tables are never shadowed, so there is no need to sync @@ -5051,12 +5056,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) void kvm_init_mmu(struct kvm_vcpu *vcpu) { + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); + union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, ®s); + if (mmu_is_nested(vcpu)) - init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu, cpu_role); else if (tdp_enabled) - init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu, cpu_role); else - init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu, cpu_role); } EXPORT_SYMBOL_GPL(kvm_init_mmu); @@ -5074,9 +5082,12 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ - vcpu->arch.root_mmu.mmu_role.ext.valid = 0; - vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; - vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; + vcpu->arch.root_mmu.root_role.word = 0; + vcpu->arch.guest_mmu.root_role.word = 0; + vcpu->arch.nested_mmu.root_role.word = 0; + vcpu->arch.root_mmu.cpu_role.ext.valid = 0; + vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; + vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); /* @@ -5097,13 +5108,13 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); + r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct); if (r) goto out; r = mmu_alloc_special_roots(vcpu); if (r) goto out; - if (vcpu->arch.mmu->direct_map) + if (vcpu->arch.mmu->root_role.direct) r = mmu_alloc_direct_roots(vcpu); else r = mmu_alloc_shadow_roots(vcpu); @@ -5330,7 +5341,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { + for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); @@ -5356,11 +5367,11 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, write_unlock(&vcpu->kvm->mmu_lock); } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, +int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_PF; - bool direct = vcpu->arch.mmu->direct_map; + bool direct = vcpu->arch.mmu->root_role.direct; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; @@ -5391,7 +5402,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, * paging in both guests. If true, we simply unprotect the page * and resume the guest. */ - if (vcpu->arch.mmu->direct_map && + if (vcpu->arch.mmu->root_role.direct && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; @@ -5625,7 +5636,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) if (!tdp_enabled) set_memory_decrypted((unsigned long)mmu->pae_root, 1); else - WARN_ON_ONCE(shadow_me_mask); + WARN_ON_ONCE(shadow_me_value); for (i = 0; i < 4; ++i) mmu->pae_root[i] = INVALID_PAE_ROOT; @@ -6287,7 +6298,7 @@ int kvm_mmu_vendor_module_init(void) */ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); - BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64)); + BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64)); kvm_mmu_reset_all_pte_masks(); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1bff453f7cbe..bd2a26897b97 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -140,9 +140,72 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + +struct kvm_page_fault { + /* arguments to kvm_mmu_do_page_fault. */ + const gpa_t addr; + const u32 error_code; + const bool prefetch; + + /* Derived from error_code. */ + const bool exec; + const bool write; + const bool present; + const bool rsvd; + const bool user; + + /* Derived from mmu and global state. */ + const bool is_tdp; + const bool nx_huge_page_workaround_enabled; + + /* + * Whether a >4KB mapping can be created or is forbidden due to NX + * hugepages. + */ + bool huge_page_disallowed; + + /* + * Maximum page size that can be created for this fault; input to + * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + */ + u8 max_level; + + /* + * Page size that can be created based on the max_level and the + * page size used by the host mapping. + */ + u8 req_level; + + /* + * Page size that will be created based on the req_level and + * huge_page_disallowed. + */ + u8 goal_level; + + /* Shifted addr, or result of guest page table walk if addr is a gva. */ + gfn_t gfn; + + /* The memslot containing gfn. May be NULL. */ + struct kvm_memory_slot *slot; + + /* Outputs of kvm_faultin_pfn. */ + kvm_pfn_t pfn; + hva_t hva; + bool map_writable; +}; + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + /* - * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(), + * and of course kvm_mmu_do_page_fault(). * + * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. @@ -151,15 +214,71 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); * * Any names added to this enum should be exported to userspace for use in * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h + * + * Note, all values must be greater than or equal to zero so as not to encroach + * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which + * will allow for efficient machine code when checking for CONTINUE, e.g. + * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero. */ enum { - RET_PF_RETRY = 0, + RET_PF_CONTINUE = 0, + RET_PF_RETRY, RET_PF_EMULATE, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, }; +static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 err, bool prefetch) +{ + struct kvm_page_fault fault = { + .addr = cr2_or_gpa, + .error_code = err, + .exec = err & PFERR_FETCH_MASK, + .write = err & PFERR_WRITE_MASK, + .present = err & PFERR_PRESENT_MASK, + .rsvd = err & PFERR_RSVD_MASK, + .user = err & PFERR_USER_MASK, + .prefetch = prefetch, + .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + + .max_level = KVM_MAX_HUGEPAGE_LEVEL, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + }; + int r; + + /* + * Async #PF "faults", a.k.a. prefetch faults, are not faults from the + * guest perspective and have already been counted at the time of the + * original fault. + */ + if (!prefetch) + vcpu->stat.pf_taken++; + + if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp) + r = kvm_tdp_page_fault(vcpu, &fault); + else + r = vcpu->arch.mmu->page_fault(vcpu, &fault); + + /* + * Similar to above, prefetch faults aren't truly spurious, and the + * async #PF path doesn't do emulation. Do count faults that are fixed + * by the async #PF handler though, otherwise they'll never be counted. + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (prefetch) + ; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; + return r; +} + int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t pfn, int max_level); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 12247b96af01..ae86820cef69 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,7 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); TRACE_DEFINE_ENUM(RET_PF_INVALID); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 01fee5f67ac3..db80f7ccaa4e 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -63,7 +63,7 @@ #define PT_LEVEL_BITS PT64_LEVEL_BITS #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 - #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) + #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) #ifdef CONFIG_X86_64 #define CMPXCHG "cmpxchgq" #endif @@ -144,42 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); } -static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t __user *ptep_user, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - signed char r; - - if (!user_access_begin(ptep_user, sizeof(pt_element_t))) - return -EFAULT; - -#ifdef CMPXCHG - asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n" - "setnz %b[r]\n" - "2:" - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) - : [ptr] "+m" (*ptep_user), - [old] "+a" (orig_pte), - [r] "=q" (r) - : [new] "r" (new_pte) - : "memory"); -#else - asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n" - "setnz %b[r]\n" - "2:" - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) - : [ptr] "+m" (*ptep_user), - [old] "+A" (orig_pte), - [r] "=q" (r) - : [new_lo] "b" ((u32)new_pte), - [new_hi] "c" ((u32)(new_pte >> 32)) - : "memory"); -#endif - - user_access_end(); - return r; -} - static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) @@ -187,7 +151,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, if (!FNAME(is_present_gpte)(gpte)) goto no_present; - /* if accessed bit is not supported prefetch non accessed gpte */ + /* Prefetch only accessed entries (unless A/D bits are disabled). */ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; @@ -278,7 +242,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (unlikely(!walker->pte_writable[level - 1])) continue; - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault); if (ret) return ret; @@ -317,7 +281,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, * is not reserved and does not indicate a large page at this level, * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. */ - gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse); + gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse); #endif /* * PG_LEVEL_4K always terminates. The RHS has bit 7 set @@ -355,7 +319,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - walker->level = mmu->root_level; + walker->level = mmu->cpu_role.base.level; pte = mmu->get_guest_pgd(vcpu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); @@ -515,14 +479,21 @@ error: * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= 0x180; + vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); if (write_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; - vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; + + /* + * Note, pte_access holds the raw RWX bits from the EPTE, not + * ACC_*_MASK flags! + */ + vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << + EPT_VIOLATION_RWX_SHIFT; } #endif walker->fault.address = addr; @@ -650,7 +621,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; - top_level = vcpu->arch.mmu->root_level; + top_level = vcpu->arch.mmu->cpu_role.base.level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; /* @@ -752,7 +723,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return ret; FNAME(pte_prefetch)(vcpu, gw, it.sptep); - ++vcpu->stat.pf_fixed; return ret; out_gpte_changed: @@ -867,10 +837,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, fault, &r)) + r = kvm_faultin_pfn(vcpu, fault); + if (r != RET_PF_CONTINUE) return r; - if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r)) + r = handle_abnormal_pfn(vcpu, fault, walker.pte_access); + if (r != RET_PF_CONTINUE) return r; /* @@ -1017,7 +989,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; + union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; int i; bool host_writable; gpa_t first_pte_gpa; @@ -1036,6 +1008,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) .level = 0xf, .access = 0x7, .quadrant = 0x3, + .passthrough = 0x1, }; /* @@ -1045,7 +1018,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * reserved bits checks will be wrong, etc... */ if (WARN_ON_ONCE(sp->role.direct || - (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) + (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) return -1; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index e5c0b6db6f2c..b5960bbde7f7 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -19,7 +19,7 @@ #include <asm/memtype.h> #include <asm/vmx.h> -static bool __read_mostly enable_mmio_caching = true; +bool __read_mostly enable_mmio_caching = true; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); u64 __read_mostly shadow_host_writable_mask; @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -167,8 +168,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, else pte_access &= ~ACC_WRITE_MASK; - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; + if (shadow_me_value && !kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_value; spte |= (u64)pfn << PAGE_SHIFT; @@ -284,7 +285,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) u64 spte = SPTE_MMU_PRESENT_MASK; spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; + shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; @@ -310,25 +311,6 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) return new_spte; } -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - u64 mark_spte_for_access_track(u64 spte) { if (spte_ad_enabled(spte)) @@ -379,12 +361,26 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; + if (!mmio_value) + enable_mmio_caching = false; + shadow_mmio_value = mmio_value; shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) +{ + /* shadow_me_value must be a subset of shadow_me_mask */ + if (WARN_ON(me_value & ~me_mask)) + me_value = me_mask = 0; + + shadow_me_value = me_value; + shadow_me_mask = me_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); + void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { shadow_user_mask = VMX_EPT_READABLE_MASK; @@ -394,8 +390,6 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; shadow_acc_track_mask = VMX_EPT_RWX_MASK; - shadow_me_mask = 0ull; - shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -446,7 +440,8 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; shadow_acc_track_mask = 0; - shadow_me_mask = sme_me_mask; + shadow_me_mask = 0; + shadow_me_value = 0; shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 80ab0f5cff01..0127bb6e3c7d 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,6 +5,8 @@ #include "mmu_internal.h" +extern bool __read_mostly enable_mmio_caching; + /* * A MMU present SPTE is backed by actual memory and may or may not be present * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it @@ -149,6 +151,7 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* @@ -204,7 +207,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && - likely(shadow_mmio_value); + likely(enable_mmio_caching); } static inline bool is_shadow_present_pte(u64 pte) @@ -212,6 +215,17 @@ static inline bool is_shadow_present_pte(u64 pte) return !!(pte & SPTE_MMU_PRESENT_MASK); } +/* + * Returns true if A/D bits are supported in hardware and are enabled by KVM. + * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can + * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the + * scenario where KVM is using A/D bits for L1, but not L2. + */ +static inline bool kvm_ad_enabled(void) +{ + return !!shadow_accessed_mask; +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 922b06bf4b94..841feaa48be5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -310,7 +310,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base; + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; @@ -1100,6 +1100,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ if (unlikely(is_mmio_spte(new_spte))) { + vcpu->stat.pf_mmio_spte_created++; trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); ret = RET_PF_EMULATE; @@ -1108,13 +1109,6 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, rcu_dereference(iter->sptep)); } - /* - * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be - * consistent with legacy MMU behavior. - */ - if (ret != RET_PF_SPURIOUS) - vcpu->stat.pf_fixed++; - return ret; } @@ -1136,7 +1130,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool account_nx, bool shared) { - u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask); + u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); int ret = 0; if (shared) { @@ -1859,7 +1853,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->shadow_root_level; + *root_level = vcpu->arch.mmu->root_role.level; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0604bc29f0b8..3f868fed9114 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -49,6 +49,32 @@ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters */ +static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; + +#define KVM_X86_PMU_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \ + *(((struct kvm_pmu_ops *)0)->func)); +#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP +#include <asm/kvm-x86-pmu-ops.h> + +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) +{ + memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops)); + +#define __KVM_X86_PMU_OP(func) \ + static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func); +#define KVM_X86_PMU_OP(func) \ + WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func) +#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP +#include <asm/kvm-x86-pmu-ops.h> +#undef __KVM_X86_PMU_OP +} + +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +{ + return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); +} + static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); @@ -216,7 +242,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -266,7 +292,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), + static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi); @@ -275,7 +301,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); if (!pmc) return; @@ -297,7 +323,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) int bit; for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { clear_bit(bit, pmu->reprogram_pmi); @@ -319,7 +345,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); + return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -369,7 +395,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); + pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; @@ -385,22 +411,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - if (kvm_x86_ops.pmu_ops->deliver_pmi) - kvm_x86_ops.pmu_ops->deliver_pmi(vcpu); + static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) || - kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr); + return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || + static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr); + struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -408,13 +433,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info); + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); } /* refresh PMU settings. This function generally is called when underlying @@ -423,7 +448,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - kvm_x86_ops.pmu_ops->refresh(vcpu); + static_call(kvm_x86_pmu_refresh)(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) @@ -431,7 +456,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - kvm_x86_ops.pmu_ops->reset(vcpu); + static_call(kvm_x86_pmu_reset)(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -439,7 +464,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - kvm_x86_ops.pmu_ops->init(vcpu); + static_call(kvm_x86_pmu_init)(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; @@ -471,14 +496,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); } - if (kvm_x86_ops.pmu_ops->cleanup) - kvm_x86_ops.pmu_ops->cleanup(vcpu); + static_call_cond(kvm_x86_pmu_cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -508,7 +532,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int config; pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); pmc->eventsel = old_eventsel; return config == perf_hw_id; } @@ -536,7 +560,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) int i; for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) continue; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 22992b049d38..e745f443b6a8 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -39,6 +39,8 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); }; +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -86,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) return pmc->type == KVM_PMC_FIXED; } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) -{ - return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc); -} - static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 421619540ff9..54fe03714f8a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -165,9 +165,8 @@ free_avic: return err; } -void avic_init_vmcb(struct vcpu_svm *svm) +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { - struct vmcb *vmcb = svm->vmcb; struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); @@ -285,11 +284,77 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) put_cpu(); } -static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, - u32 icrl, u32 icrh) +/* + * A fast-path version of avic_kick_target_vcpus(), which attempts to match + * destination APIC ID to vCPU without looping through all vCPUs. + */ +static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, + u32 icrl, u32 icrh, u32 index) { + u32 dest, apic_id; struct kvm_vcpu *vcpu; + int dest_mode = icrl & APIC_DEST_MASK; + int shorthand = icrl & APIC_SHORT_MASK; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); + + if (shorthand != APIC_DEST_NOSHORT) + return -EINVAL; + + /* + * The AVIC incomplete IPI #vmexit info provides index into + * the physical APIC ID table, which can be used to derive + * guest physical APIC ID. + */ + if (dest_mode == APIC_DEST_PHYSICAL) { + apic_id = index; + } else { + if (!apic_x2apic_mode(source)) { + /* For xAPIC logical mode, the index is for logical APIC table. */ + apic_id = avic_logical_id_table[index] & 0x1ff; + } else { + return -EINVAL; + } + } + + /* + * Assuming vcpu ID is the same as physical apic ID, + * and use it to retrieve the target vCPU. + */ + vcpu = kvm_get_vcpu_by_id(kvm, apic_id); + if (!vcpu) + return -EINVAL; + + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_APIC_DEST_FIELD(icrh); + + /* + * Try matching the destination APIC ID with the vCPU. + */ + if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) { + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + return 0; + } + + return -EINVAL; +} + +static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, + u32 icrl, u32 icrh, u32 index) +{ unsigned long i; + struct kvm_vcpu *vcpu; + + if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index)) + return; + + trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index); /* * Wake any target vCPUs that are blocking, i.e. waiting for a wake @@ -316,7 +381,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0xFF; + u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); @@ -343,7 +408,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) * set the appropriate IRR bits on the valid target * vcpus. So, we just need to kick the appropriate vcpu. */ - avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); + avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); break; case AVIC_IPI_FAILURE_INVALID_TARGET: break; @@ -357,6 +422,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) return 1; } +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return APICV_INHIBIT_REASON_NESTED; + return 0; +} + static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 96bab464967f..bed5e1692cef 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -36,40 +36,45 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { + if (vmcb->control.exit_code != SVM_EXIT_NPF) { /* * TODO: track the cause of the nested page fault, and * correctly fill in the high bits of exit_info_1. */ - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = (1ULL << 32); - svm->vmcb->control.exit_info_2 = fault->address; + vmcb->control.exit_code = SVM_EXIT_NPF; + vmcb->control.exit_code_hi = 0; + vmcb->control.exit_info_1 = (1ULL << 32); + vmcb->control.exit_info_2 = fault->address; } - svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; - svm->vmcb->control.exit_info_1 |= fault->error_code; + vmcb->control.exit_info_1 &= ~0xffffffffULL; + vmcb->control.exit_info_1 |= fault->error_code; nested_svm_vmexit(svm); } -static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) +static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { - struct vcpu_svm *svm = to_svm(vcpu); - WARN_ON(!is_guest_mode(vcpu)); + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + + WARN_ON(!is_guest_mode(vcpu)); if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; - nested_svm_vmexit(svm); - } else { - kvm_inject_page_fault(vcpu, fault); - } + !WARN_ON_ONCE(svm->nested.nested_run_pending)) { + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; + vmcb->control.exit_code_hi = 0; + vmcb->control.exit_info_1 = fault->error_code; + vmcb->control.exit_info_2 = fault->address; + nested_svm_vmexit(svm); + return true; + } + + return false; } static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) @@ -121,6 +126,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } +static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) +{ + if (!svm->v_vmload_vmsave_enabled) + return true; + + if (!nested_npt_enabled(svm)) + return true; + + if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)) + return true; + + return false; +} + void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; @@ -162,8 +181,17 @@ void recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); - vmcb_set_intercept(c, INTERCEPT_VMLOAD); - vmcb_set_intercept(c, INTERCEPT_VMSAVE); + if (nested_vmcb_needs_vls_intercept(svm)) { + /* + * If the virtual VMLOAD/VMSAVE is not enabled for the L2, + * we must intercept these instructions to correctly + * emulate them in case L1 doesn't intercept them. + */ + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); + } else { + WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)); + } } /* @@ -413,6 +441,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) */ mask &= ~V_IRQ_MASK; } + + if (nested_vgif_enabled(svm)) + mask |= V_GIF_MASK; + svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; } @@ -454,11 +486,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, vmcb12->control.exit_int_info = exit_int_info; } -static inline bool nested_npt_enabled(struct vcpu_svm *svm) -{ - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; -} - static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { /* @@ -515,6 +542,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { bool new_vmcb12 = false; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; nested_vmcb02_compute_g_pat(svm); @@ -526,18 +555,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.cpl = vmcb12->save.cpl; - vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + vmcb02->save.es = vmcb12->save.es; + vmcb02->save.cs = vmcb12->save.cs; + vmcb02->save.ss = vmcb12->save.ss; + vmcb02->save.ds = vmcb12->save.ds; + vmcb02->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(vmcb02, VMCB_SEG); } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; - vmcb_mark_dirty(svm->vmcb, VMCB_DT); + vmcb02->save.gdtr = vmcb12->save.gdtr; + vmcb02->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(vmcb02, VMCB_DT); } kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); @@ -554,47 +583,59 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = vmcb12->save.rax; - svm->vmcb->save.rsp = vmcb12->save.rsp; - svm->vmcb->save.rip = vmcb12->save.rip; + vmcb02->save.rax = vmcb12->save.rax; + vmcb02->save.rsp = vmcb12->save.rsp; + vmcb02->save.rip = vmcb12->save.rip; /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; - vmcb_mark_dirty(svm->vmcb, VMCB_DR); + vmcb_mark_dirty(vmcb02, VMCB_DR); + } + + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + /* + * Reserved bits of DEBUGCTL are ignored. Be consistent with + * svm_set_msr's definition of reserved bits. + */ + svm_copy_lbrs(vmcb02, vmcb12); + vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; + svm_update_lbrv(&svm->vcpu); + + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb02, vmcb01); } } static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { - const u32 int_ctl_vmcb01_bits = - V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; - - const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; + u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - /* - * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, - * avic_physical_id. - */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); + else + int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); /* Copied from vmcb01. msrpm_base can be overwritten later. */ - svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; - svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; - svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; + vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; + vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* nested_cr3. */ if (nested_npt_enabled(svm)) @@ -605,21 +646,53 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->nested.ctl.tsc_offset, svm->tsc_ratio_msr); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } - svm->vmcb->control.int_ctl = + vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | - (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); - - svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; - svm->vmcb->control.int_state = svm->nested.ctl.int_state; - svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; - svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + (vmcb01->control.int_ctl & int_ctl_vmcb01_bits); + + vmcb02->control.int_vector = svm->nested.ctl.int_vector; + vmcb02->control.int_state = svm->nested.ctl.int_state; + vmcb02->control.event_inj = svm->nested.ctl.event_inj; + vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + + vmcb02->control.virt_ext = vmcb01->control.virt_ext & + LBR_CTL_ENABLE_MASK; + if (svm->lbrv_enabled) + vmcb02->control.virt_ext |= + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + + if (!nested_vmcb_needs_vls_intercept(svm)) + vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + if (kvm_pause_in_guest(svm->vcpu.kvm)) { + /* use guest values since host doesn't use them */ + vmcb02->control.pause_filter_count = + svm->pause_filter_enabled ? + svm->nested.ctl.pause_filter_count : 0; + + vmcb02->control.pause_filter_thresh = + svm->pause_threshold_enabled ? + svm->nested.ctl.pause_filter_thresh : 0; + + } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { + /* use host values when guest doesn't use them */ + vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; + vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; + } else { + /* + * Intercept every PAUSE otherwise and + * ignore both host and guest values + */ + vmcb02->control.pause_filter_count = 0; + vmcb02->control.pause_filter_thresh = 0; + } nested_svm_transition_tlb_flush(vcpu); @@ -680,14 +753,14 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, if (ret) return ret; - if (!npt_enabled) - vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; - if (!from_vmrun) kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); svm_set_gif(svm, true); + if (kvm_vcpu_apicv_active(vcpu)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + return 0; } @@ -698,6 +771,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct vmcb *vmcb12; struct kvm_host_map map; u64 vmcb12_gpa; + struct vmcb *vmcb01 = svm->vmcb01.ptr; if (!svm->nested.hsave_msr) { kvm_inject_gp(vcpu, 0); @@ -741,14 +815,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) * Since vmcb01 is not in use, we can use it to store some of the L1 * state. */ - svm->vmcb01.ptr->save.efer = vcpu->arch.efer; - svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); - svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; - svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); - svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + vmcb01->save.efer = vcpu->arch.efer; + vmcb01->save.cr0 = kvm_read_cr0(vcpu); + vmcb01->save.cr4 = vcpu->arch.cr4; + vmcb01->save.rflags = kvm_get_rflags(vcpu); + vmcb01->save.rip = kvm_rip_read(vcpu); if (!npt_enabled) - svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); + vmcb01->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; @@ -814,14 +888,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb12; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; int rc; - /* Triple faults in L2 should never escape. */ - WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) @@ -843,57 +915,68 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - vmcb12->save.es = vmcb->save.es; - vmcb12->save.cs = vmcb->save.cs; - vmcb12->save.ss = vmcb->save.ss; - vmcb12->save.ds = vmcb->save.ds; - vmcb12->save.gdtr = vmcb->save.gdtr; - vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.es = vmcb02->save.es; + vmcb12->save.cs = vmcb02->save.cs; + vmcb12->save.ss = vmcb02->save.ss; + vmcb12->save.ds = vmcb02->save.ds; + vmcb12->save.gdtr = vmcb02->save.gdtr; + vmcb12->save.idtr = vmcb02->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; vmcb12->save.cr0 = kvm_read_cr0(vcpu); vmcb12->save.cr3 = kvm_read_cr3(vcpu); - vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr2 = vmcb02->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; vmcb12->save.rflags = kvm_get_rflags(vcpu); vmcb12->save.rip = kvm_rip_read(vcpu); vmcb12->save.rsp = kvm_rsp_read(vcpu); vmcb12->save.rax = kvm_rax_read(vcpu); - vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr7 = vmcb02->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; - vmcb12->save.cpl = vmcb->save.cpl; + vmcb12->save.cpl = vmcb02->save.cpl; - vmcb12->control.int_state = vmcb->control.int_state; - vmcb12->control.exit_code = vmcb->control.exit_code; - vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; - vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; - vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + vmcb12->control.int_state = vmcb02->control.int_state; + vmcb12->control.exit_code = vmcb02->control.exit_code; + vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) - vmcb12->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; + if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count) + vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + svm_copy_lbrs(vmcb12, vmcb02); + svm_update_lbrv(vcpu); + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb01, vmcb02); + svm_update_lbrv(vcpu); + } + /* * On vmexit the GIF is set to false and * no event can be injected in L1. */ svm_set_gif(svm, false); - svm->vmcb->control.exit_int_info = 0; + vmcb01->control.exit_int_info = 0; svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; - if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; - vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { @@ -907,13 +990,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* * Restore processor state that had been saved in vmcb01 */ - kvm_set_rflags(vcpu, svm->vmcb->save.rflags); - svm_set_efer(vcpu, svm->vmcb->save.efer); - svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); - svm_set_cr4(vcpu, svm->vmcb->save.cr4); - kvm_rax_write(vcpu, svm->vmcb->save.rax); - kvm_rsp_write(vcpu, svm->vmcb->save.rsp); - kvm_rip_write(vcpu, svm->vmcb->save.rip); + kvm_set_rflags(vcpu, vmcb01->save.rflags); + svm_set_efer(vcpu, vmcb01->save.efer); + svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, vmcb01->save.cr4); + kvm_rax_write(vcpu, vmcb01->save.rax); + kvm_rsp_write(vcpu, vmcb01->save.rsp); + kvm_rip_write(vcpu, vmcb01->save.rip); svm->vcpu.arch.dr7 = DR7_FIXED_1; kvm_update_dr7(&svm->vcpu); @@ -931,7 +1014,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); + rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true); if (rc) return 1; @@ -949,9 +1032,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm) * right now so that it an be accounted for before we execute * L1's next instruction. */ - if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) + if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF)) kvm_queue_exception(&(svm->vcpu), DB_VECTOR); + /* + * Un-inhibit the AVIC right away, so that other vCPUs can start + * to benefit from it right away. + */ + if (kvm_apicv_activated(vcpu->kvm)) + kvm_vcpu_update_apicv(vcpu); + return 0; } @@ -1162,12 +1252,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm) static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) { unsigned int nr = svm->vcpu.arch.exception.nr; + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + vmcb->control.exit_code_hi = 0; if (svm->vcpu.arch.exception.has_error_code) - svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; + vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; /* * EXITINFO2 is undefined for all exception intercepts other @@ -1175,11 +1266,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) */ if (nr == PF_VECTOR) { if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; } else if (nr == DB_VECTOR) { /* See inject_pending_event. */ kvm_deliver_exception_payload(&svm->vcpu); @@ -1567,6 +1658,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) struct kvm_x86_nested_ops svm_nested_ops = { .leave_nested = svm_leave_nested, .check_events = svm_check_nested_events, + .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 16a5ebb420cf..136039fc6d01 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -342,7 +342,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } } -struct kvm_pmu_ops amd_pmu_ops = { +struct kvm_pmu_ops amd_pmu_ops __initdata = { .pmc_perf_hw_id = amd_pmc_perf_hw_id, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 636c77ef55fc..51fd985cf21d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -688,7 +688,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; - blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT); + blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); if (!blob) return -ENOMEM; @@ -808,7 +808,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED(dst_paddr, 16) || !IS_ALIGNED(paddr, 16) || !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL); + tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO); if (!tpage) return -ENOMEM; @@ -1094,7 +1094,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; - blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT); + blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); if (!blob) return -ENOMEM; @@ -1176,7 +1176,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; /* allocate the memory to hold the session data blob */ - session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT); + session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); if (!session_data) return -ENOMEM; @@ -1300,11 +1300,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); if (!hdr) goto e_unpin; - trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); if (!trans_data) goto e_free_hdr; @@ -2769,8 +2769,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - ret = -EINVAL; - break; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } default: /* Error, keep GHCB MSR value as-is */ @@ -2953,6 +2957,14 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && + (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) { + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); + if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP)) + svm_clr_intercept(svm, INTERCEPT_RDTSCP); + } } void sev_es_vcpu_reset(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 17d334ef5430..200045f71df0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -62,8 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - static bool erratum_383_found __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; @@ -101,6 +99,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_EFER, .always = false }, { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, + { .index = MSR_TSC_AUX, .always = false }, { .index = MSR_INVALID, .always = false }, }; @@ -172,7 +171,7 @@ static int vls = true; module_param(vls, int, 0444); /* enable/disable Virtual GIF */ -static int vgif = true; +int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ @@ -189,6 +188,9 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -790,6 +792,17 @@ static void init_msrpm_offsets(void) } } +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) +{ + to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; + to_vmcb->save.br_from = from_vmcb->save.br_from; + to_vmcb->save.br_to = from_vmcb->save.br_to; + to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; + to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; + + vmcb_mark_dirty(to_vmcb, VMCB_LBR); +} + static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -799,6 +812,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); } static void svm_disable_lbrv(struct kvm_vcpu *vcpu) @@ -810,6 +827,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + + /* + * Move the LBR msrs back to the vmcb01 to avoid copying them + * on nested guest entries. + */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); +} + +static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +{ + /* + * If the LBR virtualization is disabled, the LBR msrs are always + * kept in the vmcb01 to avoid copying them on nested guest entries. + * + * If nested, and the LBR virtualization is enabled/disabled, the msrs + * are moved between the vmcb01 and vmcb02 as needed. + */ + struct vmcb *vmcb = + (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? + svm->vmcb : svm->vmcb01.ptr; + + switch (index) { + case MSR_IA32_DEBUGCTLMSR: + return vmcb->save.dbgctl; + case MSR_IA32_LASTBRANCHFROMIP: + return vmcb->save.br_from; + case MSR_IA32_LASTBRANCHTOIP: + return vmcb->save.br_to; + case MSR_IA32_LASTINTFROMIP: + return vmcb->save.last_excp_from; + case MSR_IA32_LASTINTTOIP: + return vmcb->save.last_excp_to; + default: + KVM_BUG(false, svm->vcpu.kvm, + "%s: Unknown MSR 0x%x", __func__, index); + return 0; + } +} + +void svm_update_lbrv(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & + DEBUGCTLMSR_LBR; + + bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & + LBR_CTL_ENABLE_MASK); + + if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) + if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) + enable_lbrv = true; + + if (enable_lbrv == current_enable_lbrv) + return; + + if (enable_lbrv) + svm_enable_lbrv(vcpu); + else + svm_disable_lbrv(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -831,6 +909,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; + if (kvm_pause_in_guest(vcpu->kvm) || !old) + return; + control->pause_filter_count = __grow_ple_window(old, pause_filter_count, pause_filter_count_grow, @@ -849,6 +930,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; + if (kvm_pause_in_guest(vcpu->kvm) || !old) + return; + control->pause_filter_count = __shrink_ple_window(old, pause_filter_count, @@ -960,6 +1044,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + + svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -979,8 +1065,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - struct vmcb_save_area *save = &svm->vmcb->save; + struct vmcb *vmcb = svm->vmcb01.ptr; + struct vmcb_control_area *control = &vmcb->control; + struct vmcb_save_area *save = &vmcb->save; svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); @@ -1104,7 +1191,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); if (kvm_vcpu_apicv_active(vcpu)) - avic_init_vmcb(svm); + avic_init_vmcb(svm, vmcb); if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); @@ -1122,10 +1209,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } } - svm_hv_init_vmcb(svm->vmcb); + svm_hv_init_vmcb(vmcb); init_vmcb_after_set_cpuid(vcpu); - vmcb_mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(vmcb); enable_gif(svm); } @@ -1380,7 +1467,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) /* * The following fields are ignored when AVIC is enabled */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); svm_set_intercept(svm, INTERCEPT_VINTR); @@ -2142,7 +2229,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * Likewise, clear the VINTR intercept, we will set it * again while processing KVM_REQ_EVENT if needed. */ - if (vgif_enabled(svm)) + if (vgif) svm_clr_intercept(svm, INTERCEPT_STGI); if (svm_is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); @@ -2160,7 +2247,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * in use, we still rely on the VINTR intercept (rather than * STGI) to detect an open interrupt window. */ - if (!vgif_enabled(svm)) + if (!vgif) svm_clear_vintr(svm); } } @@ -2575,25 +2662,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; - /* - * Nobody will change the following 5 values in the VMCB so we can - * safely return them on rdmsr. They will always be 0 until LBRV is - * implemented. - */ case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm->vmcb->save.dbgctl; - break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm->vmcb->save.br_from; - break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm->vmcb->save.br_to; - break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm->vmcb->save.last_excp_from; - break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm->vmcb->save.last_excp_to; + msr_info->data = svm_get_lbr_msr(svm, msr_info->index); break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -2839,12 +2913,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - svm->vmcb->save.dbgctl = data; - vmcb_mark_dirty(svm->vmcb, VMCB_LBR); - if (data & (1ULL<<0)) - svm_enable_lbrv(vcpu); + if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) + svm->vmcb->save.dbgctl = data; else - svm_disable_lbrv(vcpu); + svm->vmcb01.ptr->save.dbgctl = data; + + svm_update_lbrv(vcpu); + break; case MSR_VM_HSAVE_PA: /* @@ -2901,9 +2976,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) svm_clear_vintr(to_svm(vcpu)); /* - * For AVIC, the only reason to end up here is ExtINTs. + * If not running nested, for AVIC, the only reason to end up here is ExtINTs. * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. + * + * If running nested, still remove the VM wide AVIC inhibit to + * support case in which the interrupt window was requested when the + * vCPU was not running nested. + + * All vCPUs which run still run nested, will remain to have their + * AVIC still inhibited due to per-cpu AVIC inhibition. */ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); @@ -2914,7 +2996,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) static int pause_interception(struct kvm_vcpu *vcpu) { bool in_kernel; - /* * CPL is not made available for an SEV-ES guest, therefore * vcpu->arch.preempted_in_kernel can never be true. Just @@ -2922,8 +3003,7 @@ static int pause_interception(struct kvm_vcpu *vcpu) */ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; - if (!kvm_pause_in_guest(vcpu->kvm)) - grow_ple_window(vcpu); + grow_ple_window(vcpu); kvm_vcpu_on_spin(vcpu, in_kernel); return kvm_skip_emulated_instruction(vcpu); @@ -3496,14 +3576,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * enabled, the STGI interception will not occur. Enable the irq * window under the assumption that the hardware will set the GIF. */ - if (vgif_enabled(svm) || gif_set(svm)) { + if (vgif || gif_set(svm)) { /* * IRQ window is not needed when AVIC is enabled, * unless we have pending ExtINT since it cannot be injected - * via AVIC. In such case, we need to temporarily disable AVIC, + * via AVIC. In such case, KVM needs to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. + * + * If running nested, AVIC is already locally inhibited + * on this vCPU, therefore there is no need to request + * the VM wide AVIC inhibition. */ - kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); + if (!is_guest_mode(vcpu)) + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); + svm_set_vintr(svm); } } @@ -3516,7 +3602,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { - if (vgif_enabled(svm)) + if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); return; /* STGI will cause a vm exit */ } @@ -3865,7 +3951,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ @@ -3946,6 +4032,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); + + svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); + + svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && + guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); + + svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && + guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); + + svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); svm_recalc_instruction_intercepts(vcpu, svm); @@ -3963,13 +4060,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) */ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); - - /* - * Currently, AVIC does not work with nested virtualization. - * So, we disable AVIC when cpuid for SVM is set in the L1 guest. - */ - if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED); } init_vmcb_after_set_cpuid(vcpu); } @@ -4224,7 +4314,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - ret = nested_svm_vmexit(svm); + ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); if (ret) return ret; @@ -4321,7 +4411,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!gif_set(svm)) { - if (vgif_enabled(svm)) + if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ } else { @@ -4605,7 +4695,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, - .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, @@ -4633,6 +4722,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, + .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, }; /* @@ -4696,6 +4786,20 @@ static __init void svm_set_cpu_caps(void) if (tsc_scaling) kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + if (vls) + kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); + if (lbrv) + kvm_cpu_cap_set(X86_FEATURE_LBRV); + + if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) + kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); + + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) + kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); + + if (vgif) + kvm_cpu_cap_set(X86_FEATURE_VGIF); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -4789,6 +4893,9 @@ static __init int svm_hardware_setup(void) get_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + /* Setup shadow_me_value and shadow_me_mask */ + kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); + /* Note, SEV setup consumes npt_enabled. */ sev_hardware_setup(); @@ -4807,15 +4914,20 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); + enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); if (enable_apicv) { - pr_info("AVIC enabled\n"); + if (!boot_cpu_has(X86_FEATURE_AVIC)) { + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } else + pr_info("AVIC enabled\n"); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } else { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; + svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; } if (vls) { @@ -4880,6 +4992,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2d83845b9032..21c5460e947a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -29,10 +29,11 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 20 +#define MAX_DIRECT_ACCESS_MSRS 21 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern int vgif; extern bool intercept_smi; /* @@ -231,9 +232,14 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; - /* cached guest cpuid flags for faster access */ + /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; bool tsc_scaling_enabled : 1; + bool v_vmload_vmsave_enabled : 1; + bool lbrv_enabled : 1; + bool pause_filter_enabled : 1; + bool pause_threshold_enabled : 1; + bool vgif_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -452,44 +458,70 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) return vmcb_is_intercept(&svm->vmcb->control, bit); } -static inline bool vgif_enabled(struct vcpu_svm *svm) +static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); + return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); +} + +static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm) +{ + if (!vgif) + return NULL; + + if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm)) + return svm->nested.vmcb02.ptr; + else + return svm->vmcb01.ptr; } static inline void enable_gif(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl |= V_GIF_MASK; + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + vmcb->control.int_ctl |= V_GIF_MASK; else svm->vcpu.arch.hflags |= HF_GIF_MASK; } static inline void disable_gif(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl &= ~V_GIF_MASK; + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + vmcb->control.int_ctl &= ~V_GIF_MASK; else svm->vcpu.arch.hflags &= ~HF_GIF_MASK; } static inline bool gif_set(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + return !!(vmcb->control.int_ctl & V_GIF_MASK); else return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } +static inline bool nested_npt_enabled(struct vcpu_svm *svm) +{ + return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; +} + /* svm.c */ #define MSR_INVALID 0xffffffffU +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + extern bool dump_invalid_vmcb; u32 svm_msrpm_offset(u32 msr); u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); @@ -574,7 +606,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -void avic_init_vmcb(struct vcpu_svm *svm); +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); @@ -592,6 +624,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e3a24b8f04be..de4762517569 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1459,6 +1459,26 @@ TRACE_EVENT(kvm_avic_ga_log, __entry->vmid, __entry->vcpuid) ); +TRACE_EVENT(kvm_avic_kick_vcpu_slowpath, + TP_PROTO(u32 icrh, u32 icrl, u32 index), + TP_ARGS(icrh, icrl, index), + + TP_STRUCT__entry( + __field(u32, icrh) + __field(u32, icrl) + __field(u32, index) + ), + + TP_fast_assign( + __entry->icrh = icrh; + __entry->icrl = icrl; + __entry->index = index; + ), + + TP_printk("icrh:icrl=%#08x:%08x, index=%u", + __entry->icrh, __entry->icrl, __entry->index) +); + TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 856c87563883..f5cb18e00e78 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -476,24 +476,23 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit return 0; } - -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, - struct x86_exception *fault) +static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); WARN_ON(!is_guest_mode(vcpu)); if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && - !to_vmx(vcpu)->nested.nested_run_pending) { + !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) { vmcs12->vm_exit_intr_error_code = fault->error_code; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, fault->address); - } else { - kvm_inject_page_fault(vcpu, fault); + return true; } + return false; } static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, @@ -2614,9 +2613,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { @@ -3695,12 +3691,34 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) + struct vmcs12 *vmcs12, + u32 vm_exit_reason, u32 exit_intr_info) { u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.injected) { + /* + * Per the SDM, VM-Exits due to double and triple faults are never + * considered to occur during event delivery, even if the double/triple + * fault is the result of an escalating vectoring issue. + * + * Note, the SDM qualifies the double fault behavior with "The original + * event results in a double-fault exception". It's unclear why the + * qualification exists since exits due to double fault can occur only + * while vectoring a different exception (injected events are never + * subject to interception), i.e. there's _always_ an original event. + * + * The SDM also uses NMI as a confusing example for the "original event + * causes the VM exit directly" clause. NMI isn't special in any way, + * the same rule applies to all events that cause an exit directly. + * NMI is an odd choice for the example because NMIs can only occur on + * instruction boundaries, i.e. they _can't_ occur during vectoring. + */ + if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || + ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && + is_double_fault(exit_intr_info))) { + vmcs12->idt_vectoring_info_field = 0; + } else if (vcpu->arch.exception.injected) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -3733,6 +3751,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, idt_vectoring |= INTR_TYPE_EXT_INTR; vmcs12->idt_vectoring_info_field = idt_vectoring; + } else { + vmcs12->idt_vectoring_info_field = 0; } } @@ -4202,12 +4222,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (to_vmx(vcpu)->exit_reason.enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; - - vmcs12->idt_vectoring_info_field = 0; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + /* + * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched + * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other + * exit info fields are unmodified. + */ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { vmcs12->launch_state = 1; @@ -4219,7 +4239,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Transfer the event that L0 or L1 may wanted to inject into * L2 to IDT_VECTORING_INFO_FIELD. */ - vmcs12_save_pending_event(vcpu, vmcs12); + vmcs12_save_pending_event(vcpu, vmcs12, + vm_exit_reason, exit_intr_info); + + vmcs12->vm_exit_intr_info = exit_intr_info; + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* * According to spec, there's no need to store the guest's @@ -4518,9 +4543,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); - /* Similarly, triple faults in L2 should never escape. */ - WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map @@ -6809,6 +6831,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, .check_events = vmx_check_nested_events, + .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround, .hv_timer_pending = nested_vmx_preemption_timer_pending, .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b82b6709d7a8..37e9eb32e3d9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -719,7 +719,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } -struct kvm_pmu_ops intel_pmu_ops = { +struct kvm_pmu_ops intel_pmu_ops __initdata = { .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 3834bb30ce54..07e5fcf5a5aa 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -202,16 +202,17 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) void pi_wakeup_handler(void) { int cpu = smp_processor_id(); + struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); struct vcpu_vmx *vmx; - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); - list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), - pi_wakeup_list) { + raw_spin_lock(spinlock); + list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { if (pi_test_on(&vmx->pi_desc)) kvm_vcpu_wake_up(&vmx->vcpu); } - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + raw_spin_unlock(spinlock); } void __init pi_init_cpu(int cpu) @@ -311,7 +312,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, continue; } - vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc); + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index e325c290a816..2b9d7a7e83f7 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -104,6 +104,11 @@ static inline bool is_breakpoint(u32 intr_info) return is_exception_n(intr_info, BP_VECTOR); } +static inline bool is_double_fault(u32 intr_info) +{ + return is_exception_n(intr_info, DF_VECTOR); +} + static inline bool is_page_fault(u32 intr_info) { return is_exception_n(intr_info, PF_VECTOR); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 610355b9ccce..f5aeade623d6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2444,7 +2444,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_cpu_based_exec_control) < 0) return -EIO; #ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING; #endif @@ -2948,7 +2948,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, - mmu->shadow_root_level)); + mmu->root_role.level)); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } @@ -4385,7 +4385,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); - if (kvm_vcpu_apicv_active(&vmx->vcpu)) { + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -5410,9 +5410,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ - error_code |= (exit_qualification & - (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | - EPT_VIOLATION_EXECUTABLE)) + error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) ? PFERR_PRESENT_MASK : 0; error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? @@ -7823,7 +7821,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, .pi_update_irte = vmx_pi_update_irte, @@ -7856,7 +7853,7 @@ static unsigned int vmx_handle_intel_pt_intr(void) struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* '0' on failure so that the !PT case can use a RET0 static call. */ - if (!kvm_arch_pmi_in_guest(vcpu)) + if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) return 0; kvm_make_request(KVM_REQ_PMI, vcpu); @@ -7891,6 +7888,31 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static void __init vmx_setup_me_spte_mask(void) +{ + u64 me_mask = 0; + + /* + * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use + * the former to avoid exposing shadow_phys_bits. + * + * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to + * shadow_phys_bits. On MKTME and/or TDX capable systems, + * boot_cpu_data.x86_phys_bits holds the actual physical address + * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR + * reported by CPUID. Those bits between are KeyID bits. + */ + if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) + me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, + kvm_get_shadow_phys_bits() - 1); + /* + * Unlike SME, host kernel doesn't support setting up any + * MKTME KeyID on Intel platforms. No memory encryption + * bits should be included into the SPTE. + */ + kvm_mmu_set_me_spte_mask(0, me_mask); +} + static struct kvm_x86_init_ops vmx_init_ops __initdata; static __init int hardware_setup(void) @@ -7993,6 +8015,12 @@ static __init int hardware_setup(void) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + /* + * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID + * bits to shadow_zero_check. + */ + vmx_setup_me_spte_mask(); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), ept_caps_to_lpage_level(vmx_capability.ept)); @@ -8077,6 +8105,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, + .pmu_ops = &intel_pmu_ops, }; static void vmx_cleanup_l1d_flush(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4790f0d7d40b..b81ef4f497f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -266,7 +266,12 @@ const struct kvm_stats_header kvm_vm_stats_header = { const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, pf_taken), STATS_DESC_COUNTER(VCPU, pf_fixed), + STATS_DESC_COUNTER(VCPU, pf_emulate), + STATS_DESC_COUNTER(VCPU, pf_spurious), + STATS_DESC_COUNTER(VCPU, pf_fast), + STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created), STATS_DESC_COUNTER(VCPU, pf_guest), STATS_DESC_COUNTER(VCPU, tlb_flush), STATS_DESC_COUNTER(VCPU, invlpg), @@ -748,6 +753,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); +/* Returns true if the page fault was immediately morphed into a VM-Exit. */ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -766,8 +772,26 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, fault_mmu->root.hpa); + /* + * A workaround for KVM's bad exception handling. If KVM injected an + * exception into L2, and L2 encountered a #PF while vectoring the + * injected exception, manually check to see if L1 wants to intercept + * #PF, otherwise queuing the #PF will lead to #DF or a lost exception. + * In all other cases, defer the check to nested_ops->check_events(), + * which will correctly handle priority (this does not). Note, other + * exceptions, e.g. #GP, are theoretically affected, #PF is simply the + * most problematic, e.g. when L0 and L1 are both intercepting #PF for + * shadow paging. + * + * TODO: Rewrite exception handling to track injected and pending + * (VM-Exit) exceptions separately. + */ + if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) && + kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault)) + return true; + fault_mmu->inject_page_fault(vcpu, fault); - return fault->nested_page_fault; + return false; } EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); @@ -961,11 +985,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (static_cpu_has(X86_FEATURE_PKU) && - (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || - (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && - vcpu->arch.pkru != vcpu->arch.host_pkru) + vcpu->arch.pkru != vcpu->arch.host_pkru && + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -974,13 +1000,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (static_cpu_has(X86_FEATURE_PKU) && - (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || - (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { @@ -2249,14 +2277,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(system_time & 1)) - return; - - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + if (system_time & 1) { + kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); + } else { + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + } return; } @@ -2961,63 +2988,55 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_pvclock_page(struct kvm_vcpu *v, - struct gfn_to_hva_cache *cache, - unsigned int offset) +static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) { struct kvm_vcpu_arch *vcpu = &v->arch; - struct pvclock_vcpu_time_info guest_hv_clock; + struct pvclock_vcpu_time_info *guest_hv_clock; + unsigned long flags; - if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, - &guest_hv_clock, offset, sizeof(guest_hv_clock)))) - return; + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); - /* This VCPU is paused, but it's legal for a guest to read another + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) + return; + + read_lock_irqsave(&gpc->lock, flags); + } + + guest_hv_clock = (void *)(gpc->khva + offset); + + /* + * This VCPU is paused, but it's legal for a guest to read another * VCPU's kvmclock, so we really have to follow the specification where * it says that version is odd if data is being modified, and even after * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - if (guest_hv_clock.version & 1) - ++guest_hv_clock.version; /* first time write, random junk */ - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock.version)); + guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); + smp_wmb(); - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock)); + guest_hv_clock->version = ++vcpu->hv_clock.version; - smp_wmb(); + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + read_unlock_irqrestore(&gpc->lock, flags); - vcpu->hv_clock.version++; - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock.version)); + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -3106,13 +3125,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); - if (vcpu->xen.vcpu_info_set) - kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time)); - if (vcpu->xen.vcpu_time_info_set) - kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); + if (vcpu->pv_time.active) + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); + if (vcpu->xen.vcpu_info_cache.active) + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (vcpu->xen.vcpu_time_info_cache.active) + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -3300,7 +3319,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.pv_time_enabled = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -4284,7 +4303,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO | - KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; @@ -4331,6 +4351,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: + case KVM_CAP_VM_TSC_CONTROL: r = kvm_has_tsc_control; break; case KVM_CAP_X2APIC_API: @@ -5102,7 +5123,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.pv_time_enabled) + if (!vcpu->arch.pv_time.active) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -6186,7 +6207,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time_enabled) + if (!vcpu->arch.pv_time.active) continue; ret = kvm_set_guest_paused(vcpu); @@ -6513,6 +6534,15 @@ set_pit2_out: r = kvm_xen_hvm_set_attr(kvm, &xha); break; } + case KVM_XEN_HVM_EVTCHN_SEND: { + struct kvm_irq_routing_xen_evtchn uxe; + + r = -EFAULT; + if (copy_from_user(&uxe, argp, sizeof(uxe))) + goto out; + r = kvm_xen_hvm_evtchn_send(kvm, &uxe); + break; + } #endif case KVM_SET_CLOCK: r = kvm_vm_ioctl_set_clock(kvm, argp); @@ -6520,6 +6550,28 @@ set_pit2_out: case KVM_GET_CLOCK: r = kvm_vm_ioctl_get_clock(kvm, argp); break; + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + user_tsc_khz = (u32)arg; + + if (kvm_has_tsc_control && + user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); + r = 0; + + goto out; + } + case KVM_GET_TSC_KHZ: { + r = READ_ONCE(kvm->arch.default_tsc_khz); + goto out; + } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) @@ -7229,15 +7281,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, exception, &write_emultor); } -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif +#define emulator_try_cmpxchg_user(t, ptr, old, new) \ + (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t)) static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -7246,12 +7291,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned int bytes, struct x86_exception *exception) { - struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u64 page_line_mask; + unsigned long hva; gpa_t gpa; - char *kaddr; - bool exchanged; + int r; /* guests cmpxchg8b have to be emulated atomically */ if (bytes > 8 || (bytes & (bytes - 1))) @@ -7275,31 +7319,32 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) + hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) goto emul_write; - kaddr = map.hva + offset_in_page(gpa); + hva += offset_in_page(gpa); switch (bytes) { case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + r = emulator_try_cmpxchg_user(u8, hva, old, new); break; case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + r = emulator_try_cmpxchg_user(u16, hva, old, new); break; case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + r = emulator_try_cmpxchg_user(u32, hva, old, new); break; case 8: - exchanged = CMPXCHG64(kaddr, old, new); + r = emulator_try_cmpxchg_user(u64, hva, old, new); break; default: BUG(); } - kvm_vcpu_unmap(vcpu, &map, true); - - if (!exchanged) + if (r < 0) + return X86EMUL_UNHANDLEABLE; + if (r) return X86EMUL_CMPXCHG_FAILED; kvm_page_track_write(vcpu, gpa, new, bytes); @@ -8061,7 +8106,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; - if (!vcpu->arch.mmu->direct_map) { + if (!vcpu->arch.mmu->root_role.direct) { /* * Write permission should be allowed since only * write access need to be emulated. @@ -8094,7 +8139,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_release_pfn_clean(pfn); /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu->direct_map) { + if (vcpu->arch.mmu->root_role.direct) { unsigned int indirect_shadow_pages; write_lock(&vcpu->kvm->mmu_lock); @@ -8162,7 +8207,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, vcpu->arch.last_retry_eip = ctxt->eip; vcpu->arch.last_retry_addr = cr2_or_gpa; - if (!vcpu->arch.mmu->direct_map) + if (!vcpu->arch.mmu->root_role.direct) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -8251,7 +8296,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); -static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { @@ -8320,25 +8365,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) } /* - * Decode to be emulated instruction. Return EMULATION_OK if success. + * Decode an instruction for emulation. The caller is responsible for handling + * code breakpoints. Note, manually detecting code breakpoints is unnecessary + * (and wrong) when emulating on an intercepted fault-like exception[*], as + * code breakpoints have higher priority and thus have already been done by + * hardware. + * + * [*] Except #MC, which is higher priority, but KVM should never emulate in + * response to a machine check. */ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len) { - int r = EMULATION_OK; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + int r; init_emulate_ctxt(vcpu); - /* - * We will reenter on the same instruction since we do not set - * complete_userspace_io. This does not handle watchpoints yet, - * those would be handled in the emulate_ops. - */ - if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_breakpoint(vcpu, &r)) - return r; - r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); trace_kvm_emulate_insn_start(vcpu); @@ -8371,6 +8414,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); + /* + * Return immediately if RIP hits a code breakpoint, such #DBs + * are fault-like and are higher priority than any faults on + * the code fetch itself. + */ + if (!(emulation_type & EMULTYPE_SKIP) && + kvm_vcpu_check_code_breakpoint(vcpu, &r)) + return r; + r = x86_decode_emulated_instruction(vcpu, emulation_type, insn, insn_len); if (r != EMULATION_OK) { @@ -8442,7 +8494,7 @@ restart: ctxt->exception.address = cr2_or_gpa; /* With shadow page tables, cr2 contains a GVA or nGPA. */ - if (vcpu->arch.mmu->direct_map) { + if (vcpu->arch.mmu->root_role.direct) { ctxt->gpa_available = true; ctxt->gpa_val = cr2_or_gpa; } @@ -8789,22 +8841,22 @@ static int kvmclock_cpu_online(unsigned int cpu) static void kvm_timer_init(void) { - max_tsc_khz = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy *policy; - int cpu; - - cpu = get_cpu(); - policy = cpufreq_cpu_get(cpu); - if (policy) { - if (policy->cpuinfo.max_freq) - max_tsc_khz = policy->cpuinfo.max_freq; - cpufreq_cpu_put(policy); + max_tsc_khz = tsc_khz; + + if (IS_ENABLED(CONFIG_CPU_FREQ)) { + struct cpufreq_policy *policy; + int cpu; + + cpu = get_cpu(); + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } + put_cpu(); } - put_cpu(); -#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } @@ -9089,6 +9141,14 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) +{ + ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); + ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu); + + return (vm_reasons | vcpu_reasons) == 0; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set) @@ -9266,6 +9326,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); + /* + * If the quirk is disabled, synthesize a #UD and let the guest pick up + * the pieces. + */ + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) { + ctxt->exception.error_code_valid = false; + ctxt->exception.vector = UD_VECTOR; + ctxt->have_exception = true; + return X86EMUL_PROPAGATE_FAULT; + } + static_call(kvm_x86_patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, @@ -9763,7 +9834,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); - activate = kvm_apicv_activated(vcpu->kvm); + activate = kvm_vcpu_apicv_activated(vcpu); + if (vcpu->arch.apicv_active == activate) goto out; @@ -10171,7 +10243,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * per-VM state, and responsing vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ - WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) @@ -10250,14 +10322,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ guest_timing_exit_irqoff(); - if (lapic_in_kernel(vcpu)) { - s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; - if (delta != S64_MIN) { - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); - vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; - } - } - local_irq_enable(); preempt_enable(); @@ -10368,6 +10432,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); + if (kvm_xen_has_pending_events(vcpu)) + kvm_xen_inject_pending_events(vcpu); + if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -11263,9 +11330,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + kvm_xen_init_vcpu(vcpu); kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); - kvm_set_tsc_khz(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); vcpu_put(vcpu); @@ -11320,6 +11388,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); + kvm_xen_destroy_vcpu(vcpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); @@ -11581,6 +11650,24 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + int kvm_arch_hardware_setup(void *opaque) { struct kvm_x86_init_ops *ops = opaque; @@ -11595,8 +11682,7 @@ int kvm_arch_hardware_setup(void *opaque) if (r != 0) return r; - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - kvm_ops_static_call_update(); + kvm_ops_update(ops); kvm_register_perf_callbacks(ops->handle_intel_pt_intr); @@ -11712,6 +11798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; kvm->arch.guest_can_read_msr_platform_info = true; kvm->arch.enable_pmu = enable_pmu; @@ -11747,20 +11834,15 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) vcpu_put(vcpu); } -static void kvm_free_vcpus(struct kvm *kvm) +static void kvm_unload_vcpu_mmus(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; - /* - * Unpin any mmu pages first. - */ kvm_for_each_vcpu(i, vcpu, kvm) { kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } - - kvm_destroy_vcpus(kvm); } void kvm_arch_sync_events(struct kvm *kvm) @@ -11866,11 +11948,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } + kvm_unload_vcpu_mmus(kvm); static_call_cond(kvm_x86_vm_destroy)(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_free_vcpus(kvm); + kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); @@ -12193,6 +12276,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) return true; + if (kvm_xen_has_pending_events(vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) + return true; + return false; } @@ -12290,25 +12379,6 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) } EXPORT_SYMBOL_GPL(kvm_set_rflags); -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) || - work->wakeup_all) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu->direct_map && - work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) - return; - - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); -} - static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); @@ -13000,6 +13070,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bf6cc25eee76..610beba35907 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -9,17 +9,25 @@ #include "x86.h" #include "xen.h" #include "hyperv.h" +#include "lapic.h" +#include <linux/eventfd.h> #include <linux/kvm_host.h> #include <linux/sched/stat.h> #include <trace/events/kvm.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/version.h> #include <xen/interface/event_channel.h> +#include <xen/interface/sched.h> #include "trace.h" +static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); +static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); + DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) @@ -102,6 +110,66 @@ out: return ret; } +void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) +{ + if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) { + struct kvm_xen_evtchn e; + + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + kvm_xen_set_evtchn(&e, vcpu->kvm); + + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); + } +} + +static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, + arch.xen.timer); + if (atomic_read(&vcpu->arch.xen.timer_pending)) + return HRTIMER_NORESTART; + + atomic_inc(&vcpu->arch.xen.timer_pending); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); + + return HRTIMER_NORESTART; +} + +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +{ + atomic_set(&vcpu->arch.xen.timer_pending, 0); + vcpu->arch.xen.timer_expires = guest_abs; + + if (delta_ns <= 0) { + xen_timer_callback(&vcpu->arch.xen.timer); + } else { + ktime_t ktime_now = ktime_get(); + hrtimer_start(&vcpu->arch.xen.timer, + ktime_add_ns(ktime_now, delta_ns), + HRTIMER_MODE_ABS_HARD); + } +} + +static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_cancel(&vcpu->arch.xen.timer); + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); +} + +static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); + vcpu->arch.xen.timer.function = xen_timer_callback; +} + static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; @@ -133,27 +201,36 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; - struct gfn_to_hva_cache *ghc = &vx->runstate_cache; - struct kvm_memslots *slots = kvm_memslots(v->kvm); - bool atomic = (state == RUNSTATE_runnable); - uint64_t state_entry_time; - int __user *user_state; - uint64_t __user *user_times; + struct gfn_to_pfn_cache *gpc = &vx->runstate_cache; + uint64_t *user_times; + unsigned long flags; + size_t user_len; + int *user_state; kvm_xen_update_runstate(v, state); - if (!vx->runstate_set) + if (!vx->runstate_cache.active) return; - if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) && - kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len)) - return; + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) + user_len = sizeof(struct vcpu_runstate_info); + else + user_len = sizeof(struct compat_vcpu_runstate_info); - /* We made sure it fits in a single page */ - BUG_ON(!ghc->memslot); + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + user_len)) { + read_unlock_irqrestore(&gpc->lock, flags); - if (atomic) - pagefault_disable(); + /* When invoked from kvm_sched_out() we cannot sleep */ + if (state == RUNSTATE_runnable) + return; + + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len)) + return; + + read_lock_irqsave(&gpc->lock, flags); + } /* * The only difference between 32-bit and 64-bit versions of the @@ -167,38 +244,33 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); - user_state = (int __user *)ghc->hva; - BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); - - user_times = (uint64_t __user *)(ghc->hva + - offsetof(struct compat_vcpu_runstate_info, - state_entry_time)); #ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); - - if (v->kvm->arch.xen.long_mode) - user_times = (uint64_t __user *)(ghc->hva + - offsetof(struct vcpu_runstate_info, - state_entry_time)); #endif + + user_state = gpc->khva; + + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) + user_times = gpc->khva + offsetof(struct vcpu_runstate_info, + state_entry_time); + else + user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info, + state_entry_time); + /* * First write the updated state_entry_time at the appropriate * location determined by 'offset'. */ - state_entry_time = vx->runstate_entry_time; - state_entry_time |= XEN_RUNSTATE_UPDATE; - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(state_entry_time)); + sizeof(user_times[0])); BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(state_entry_time)); + sizeof(user_times[0])); - if (__put_user(state_entry_time, user_times)) - goto out; + user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; smp_wmb(); /* @@ -212,8 +284,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - if (__put_user(vx->current_runstate, user_state)) - goto out; + *user_state = vx->current_runstate; /* * Write the actual runstate times immediately after the @@ -228,42 +299,114 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times))) - goto out; + memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); smp_wmb(); /* * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's * runstate_entry_time field. */ - state_entry_time &= ~XEN_RUNSTATE_UPDATE; - __put_user(state_entry_time, user_times); + user_times[0] &= ~XEN_RUNSTATE_UPDATE; smp_wmb(); - out: - mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + read_unlock_irqrestore(&gpc->lock, flags); - if (atomic) - pagefault_enable(); + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } -int __kvm_xen_has_interrupt(struct kvm_vcpu *v) +static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) +{ + struct kvm_lapic_irq irq = { }; + int r; + + irq.dest_id = v->vcpu_id; + irq.vector = v->arch.xen.upcall_vector; + irq.dest_mode = APIC_DEST_PHYSICAL; + irq.shorthand = APIC_DEST_NOSHORT; + irq.delivery_mode = APIC_DM_FIXED; + irq.level = 1; + + /* The fast version will always work for physical unicast */ + WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); +} + +/* + * On event channel delivery, the vcpu_info may not have been accessible. + * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which + * need to be marked into the vcpu_info (and evtchn_upcall_pending set). + * Do so now that we can sleep in the context of the vCPU to bring the + * page in, and refresh the pfn cache for it. + */ +void kvm_xen_inject_pending_events(struct kvm_vcpu *v) { unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); - bool atomic = in_atomic() || !task_is_running(current); - int err; + struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; + unsigned long flags; + + if (!evtchn_pending_sel) + return; + + /* + * Yes, this is an open-coded loop. But that's just what put_user() + * does anyway. Page it in and retry the instruction. We're just a + * little more honest about it. + */ + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { + read_unlock_irqrestore(&gpc->lock, flags); + + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) + return; + + read_lock_irqsave(&gpc->lock, flags); + } + + /* Now gpc->khva is a valid kernel address for the vcpu_info */ + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + struct vcpu_info *vi = gpc->khva; + + asm volatile(LOCK_PREFIX "orq %0, %1\n" + "notq %0\n" + LOCK_PREFIX "andq %0, %2\n" + : "=r" (evtchn_pending_sel), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel)); + WRITE_ONCE(vi->evtchn_upcall_pending, 1); + } else { + u32 evtchn_pending_sel32 = evtchn_pending_sel; + struct compat_vcpu_info *vi = gpc->khva; + + asm volatile(LOCK_PREFIX "orl %0, %1\n" + "notl %0\n" + LOCK_PREFIX "andl %0, %2\n" + : "=r" (evtchn_pending_sel32), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel32)); + WRITE_ONCE(vi->evtchn_upcall_pending, 1); + } + read_unlock_irqrestore(&gpc->lock, flags); + + /* For the per-vCPU lapic vector, deliver it as MSI. */ + if (v->arch.xen.upcall_vector) + kvm_xen_inject_vcpu_vector(v); + + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); +} + +int __kvm_xen_has_interrupt(struct kvm_vcpu *v) +{ + struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; + unsigned long flags; u8 rc = 0; /* * If the global upcall vector (HVMIRQ_callback_vector) is set and * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. */ - struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; - struct kvm_memslots *slots = kvm_memslots(v->kvm); - bool ghc_valid = slots->generation == ghc->generation && - !kvm_is_error_hva(ghc->hva) && ghc->memslot; - - unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); /* No need for compat handling here */ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != @@ -273,101 +416,35 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) BUILD_BUG_ON(sizeof(rc) != sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); - /* - * For efficiency, this mirrors the checks for using the valid - * cache in kvm_read_guest_offset_cached(), but just uses - * __get_user() instead. And falls back to the slow path. - */ - if (!evtchn_pending_sel && ghc_valid) { - /* Fast path */ - pagefault_disable(); - err = __get_user(rc, (u8 __user *)ghc->hva + offset); - pagefault_enable(); - if (!err) - return rc; - } - - /* Slow path */ + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { + read_unlock_irqrestore(&gpc->lock, flags); - /* - * This function gets called from kvm_vcpu_block() after setting the - * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately - * from a HLT. So we really mustn't sleep. If the page ended up absent - * at that point, just return 1 in order to trigger an immediate wake, - * and we'll end up getting called again from a context where we *can* - * fault in the page and wait for it. - */ - if (atomic) - return 1; + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + */ + if (in_atomic() || !task_is_running(current)) + return 1; - if (!ghc_valid) { - err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len); - if (err || !ghc->memslot) { + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. */ return 0; } + read_lock_irqsave(&gpc->lock, flags); } - /* - * Now we have a valid (protected by srcu) userspace HVA in - * ghc->hva which points to the struct vcpu_info. If there - * are any bits in the in-kernel evtchn_pending_sel then - * we need to write those to the guest vcpu_info and set - * its evtchn_upcall_pending flag. If there aren't any bits - * to add, we only want to *check* evtchn_upcall_pending. - */ - if (evtchn_pending_sel) { - bool long_mode = v->kvm->arch.xen.long_mode; - - if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info))) - return 0; - - if (IS_ENABLED(CONFIG_64BIT) && long_mode) { - struct vcpu_info __user *vi = (void __user *)ghc->hva; - - /* Attempt to set the evtchn_pending_sel bits in the - * guest, and if that succeeds then clear the same - * bits in the in-kernel version. */ - asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n" - "\tnotq %0\n" - "\t" LOCK_PREFIX "andq %0, %2\n" - "2:\n" - _ASM_EXTABLE_UA(1b, 2b) - : "=r" (evtchn_pending_sel), - "+m" (vi->evtchn_pending_sel), - "+m" (v->arch.xen.evtchn_pending_sel) - : "0" (evtchn_pending_sel)); - } else { - struct compat_vcpu_info __user *vi = (void __user *)ghc->hva; - u32 evtchn_pending_sel32 = evtchn_pending_sel; - - /* Attempt to set the evtchn_pending_sel bits in the - * guest, and if that succeeds then clear the same - * bits in the in-kernel version. */ - asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n" - "\tnotl %0\n" - "\t" LOCK_PREFIX "andl %0, %2\n" - "2:\n" - _ASM_EXTABLE_UA(1b, 2b) - : "=r" (evtchn_pending_sel32), - "+m" (vi->evtchn_pending_sel), - "+m" (v->arch.xen.evtchn_pending_sel) - : "0" (evtchn_pending_sel32)); - } - rc = 1; - unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err); - - err: - user_access_end(); - - mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); - } else { - __get_user(rc, (u8 __user *)ghc->hva + offset); - } - + rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending; + read_unlock_irqrestore(&gpc->lock, flags); return rc; } @@ -375,36 +452,51 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { + mutex_lock(&kvm->lock); kvm->arch.xen.long_mode = !!data->u.long_mode; + mutex_unlock(&kvm->lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: + mutex_lock(&kvm->lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + mutex_unlock(&kvm->lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { + mutex_lock(&kvm->lock); kvm->arch.xen.upcall_vector = data->u.vector; + mutex_unlock(&kvm->lock); r = 0; } break; + case KVM_XEN_ATTR_TYPE_EVTCHN: + r = kvm_xen_setattr_evtchn(kvm, data); + break; + + case KVM_XEN_ATTR_TYPE_XEN_VERSION: + mutex_lock(&kvm->lock); + kvm->arch.xen.xen_version = data->u.xen_version; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } - mutex_unlock(&kvm->lock); return r; } @@ -433,6 +525,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_XEN_VERSION: + data->u.xen_version = kvm->arch.xen.xen_version; + r = 0; + break; + default: break; } @@ -457,48 +554,34 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - vcpu->arch.xen.vcpu_info_set = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - /* It must fit within a single page */ - if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) { - r = -EINVAL; - break; - } - - r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache, - data->u.gpa, + NULL, KVM_HOST_USES_PFN, data->u.gpa, sizeof(struct vcpu_info)); - if (!r) { - vcpu->arch.xen.vcpu_info_set = true; + if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } + break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - vcpu->arch.xen.vcpu_time_info_set = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - /* It must fit within a single page */ - if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) { - r = -EINVAL; - break; - } - - r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache, - data->u.gpa, + NULL, KVM_HOST_USES_PFN, data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); - if (!r) { - vcpu->arch.xen.vcpu_time_info_set = true; + if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: @@ -507,24 +590,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } if (data->u.gpa == GPA_INVALID) { - vcpu->arch.xen.runstate_set = false; + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); r = 0; break; } - /* It must fit within a single page */ - if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) { - r = -EINVAL; - break; - } - - r = kvm_gfn_to_hva_cache_init(vcpu->kvm, + r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - data->u.gpa, + NULL, KVM_HOST_USES_PFN, data->u.gpa, sizeof(struct vcpu_runstate_info)); - if (!r) { - vcpu->arch.xen.runstate_set = true; - } break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: @@ -622,6 +697,46 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: + if (data->u.vcpu_id >= KVM_MAX_VCPUS) + r = -EINVAL; + else { + vcpu->arch.xen.vcpu_id = data->u.vcpu_id; + r = 0; + } + break; + + case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + if (data->u.timer.port) { + if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { + r = -EINVAL; + break; + } + vcpu->arch.xen.timer_virq = data->u.timer.port; + kvm_xen_init_timer(vcpu); + + /* Restart the timer if it's set */ + if (data->u.timer.expires_ns) + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, + data->u.timer.expires_ns - + get_kvmclock_ns(vcpu->kvm)); + } else if (kvm_xen_timer_enabled(vcpu)) { + kvm_xen_stop_timer(vcpu); + vcpu->arch.xen.timer_virq = 0; + } + + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: + if (data->u.vector && data->u.vector < 0x10) + r = -EINVAL; + else { + vcpu->arch.xen.upcall_vector = data->u.vector; + r = 0; + } + break; + default: break; } @@ -639,7 +754,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: - if (vcpu->arch.xen.vcpu_info_set) + if (vcpu->arch.xen.vcpu_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else data->u.gpa = GPA_INVALID; @@ -647,7 +762,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: - if (vcpu->arch.xen.vcpu_time_info_set) + if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; else data->u.gpa = GPA_INVALID; @@ -659,7 +774,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = -EOPNOTSUPP; break; } - if (vcpu->arch.xen.runstate_set) { + if (vcpu->arch.xen.runstate_cache.active) { data->u.gpa = vcpu->arch.xen.runstate_cache.gpa; r = 0; } @@ -697,6 +812,23 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = -EINVAL; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: + data->u.vcpu_id = vcpu->arch.xen.vcpu_id; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + data->u.timer.port = vcpu->arch.xen.timer_virq; + data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; + r = 0; + break; + + case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: + data->u.vector = vcpu->arch.xen.upcall_vector; + r = 0; + break; + default: break; } @@ -777,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) { - if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) + /* Only some feature flags need to be *enabled* by userspace */ + u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + + if (xhc->flags & ~permitted_flags) return -EINVAL; /* @@ -802,18 +938,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } -void kvm_xen_init_vm(struct kvm *kvm) -{ -} - -void kvm_xen_destroy_vm(struct kvm *kvm) -{ - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); - - if (kvm->arch.xen_hvm_config.msr) - static_branch_slow_dec_deferred(&kvm_xen_enabled); -} - static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { kvm_rax_write(vcpu, result); @@ -830,10 +954,268 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); } +static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, + evtchn_port_t *ports) +{ + struct kvm *kvm = vcpu->kvm; + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + unsigned long *pending_bits; + unsigned long flags; + bool ret = true; + int idx, i; + + read_lock_irqsave(&gpc->lock, flags); + idx = srcu_read_lock(&kvm->srcu); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + goto out_rcu; + + ret = false; + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + } else { + struct compat_shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + } + + for (i = 0; i < nr_ports; i++) { + if (test_bit(ports[i], pending_bits)) { + ret = true; + break; + } + } + + out_rcu: + srcu_read_unlock(&kvm->srcu, idx); + read_unlock_irqrestore(&gpc->lock, flags); + + return ret; +} + +static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, + u64 param, u64 *r) +{ + int idx, i; + struct sched_poll sched_poll; + evtchn_port_t port, *ports; + gpa_t gpa; + + if (!longmode || !lapic_in_kernel(vcpu) || + !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) + return false; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, + sizeof(sched_poll))) { + *r = -EFAULT; + return true; + } + + if (unlikely(sched_poll.nr_ports > 1)) { + /* Xen (unofficially) limits number of pollers to 128 */ + if (sched_poll.nr_ports > 128) { + *r = -EINVAL; + return true; + } + + ports = kmalloc_array(sched_poll.nr_ports, + sizeof(*ports), GFP_KERNEL); + if (!ports) { + *r = -ENOMEM; + return true; + } + } else + ports = &port; + + for (i = 0; i < sched_poll.nr_ports; i++) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, + (gva_t)(sched_poll.ports + i), + NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, + &ports[i], sizeof(port))) { + *r = -EFAULT; + goto out; + } + } + + if (sched_poll.nr_ports == 1) + vcpu->arch.xen.poll_evtchn = port; + else + vcpu->arch.xen.poll_evtchn = -1; + + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + + if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + + if (sched_poll.timeout) + mod_timer(&vcpu->arch.xen.poll_timer, + jiffies + nsecs_to_jiffies(sched_poll.timeout)); + + kvm_vcpu_halt(vcpu); + + if (sched_poll.timeout) + del_timer(&vcpu->arch.xen.poll_timer); + + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + } + + vcpu->arch.xen.poll_evtchn = 0; + *r = 0; +out: + /* Really, this is only needed in case of timeout */ + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + + if (unlikely(sched_poll.nr_ports > 1)) + kfree(ports); + return true; +} + +static void cancel_evtchn_poll(struct timer_list *t) +{ + struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); +} + +static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode, + int cmd, u64 param, u64 *r) +{ + switch (cmd) { + case SCHEDOP_poll: + if (kvm_xen_schedop_poll(vcpu, longmode, param, r)) + return true; + fallthrough; + case SCHEDOP_yield: + kvm_vcpu_on_spin(vcpu, true); + *r = 0; + return true; + default: + break; + } + + return false; +} + +struct compat_vcpu_set_singleshot_timer { + uint64_t timeout_abs_ns; + uint32_t flags; +} __attribute__((packed)); + +static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, + int vcpu_id, u64 param, u64 *r) +{ + struct vcpu_set_singleshot_timer oneshot; + s64 delta; + gpa_t gpa; + int idx; + + if (!kvm_xen_timer_enabled(vcpu)) + return false; + + switch (cmd) { + case VCPUOP_set_singleshot_timer: + if (vcpu->arch.xen.vcpu_id != vcpu_id) { + *r = -EINVAL; + return true; + } + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + /* + * The only difference for 32-bit compat is the 4 bytes of + * padding after the interesting part of the structure. So + * for a faithful emulation of Xen we have to *try* to copy + * the padding and return -EFAULT if we can't. Otherwise we + * might as well just have copied the 12-byte 32-bit struct. + */ + BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != + offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != + sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns)); + BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) != + offsetof(struct vcpu_set_singleshot_timer, flags)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != + sizeof_field(struct vcpu_set_singleshot_timer, flags)); + + if (!gpa || + kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) : + sizeof(struct compat_vcpu_set_singleshot_timer))) { + *r = -EFAULT; + return true; + } + + delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); + if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { + *r = -ETIME; + return true; + } + + kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); + *r = 0; + return true; + + case VCPUOP_stop_singleshot_timer: + if (vcpu->arch.xen.vcpu_id != vcpu_id) { + *r = -EINVAL; + return true; + } + kvm_xen_stop_timer(vcpu); + *r = 0; + return true; + } + + return false; +} + +static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, + u64 *r) +{ + if (!kvm_xen_timer_enabled(vcpu)) + return false; + + if (timeout) { + uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); + int64_t delta = timeout - guest_now; + + /* Xen has a 'Linux workaround' in do_set_timer_op() which + * checks for negative absolute timeout values (caused by + * integer overflow), and for values about 13 days in the + * future (2^50ns) which would be caused by jiffies + * overflow. For those cases, it sets the timeout 100ms in + * the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep + * churning CPU time by waking it up). + */ + if (unlikely((int64_t)timeout < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0))) { + delta = 100 * NSEC_PER_MSEC; + timeout = guest_now + delta; + } + + kvm_xen_start_timer(vcpu, timeout, delta); + } else { + kvm_xen_stop_timer(vcpu); + } + + *r = 0; + return true; +} + int kvm_xen_hypercall(struct kvm_vcpu *vcpu) { bool longmode; - u64 input, params[6]; + u64 input, params[6], r = -ENOSYS; + bool handled = false; input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -864,6 +1246,40 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) trace_kvm_xen_hypercall(input, params[0], params[1], params[2], params[3], params[4], params[5]); + switch (input) { + case __HYPERVISOR_xen_version: + if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { + r = vcpu->kvm->arch.xen.xen_version; + handled = true; + } + break; + case __HYPERVISOR_event_channel_op: + if (params[0] == EVTCHNOP_send) + handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r); + break; + case __HYPERVISOR_sched_op: + handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0], + params[1], &r); + break; + case __HYPERVISOR_vcpu_op: + handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1], + params[2], &r); + break; + case __HYPERVISOR_set_timer_op: { + u64 timeout = params[0]; + /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */ + if (!longmode) + timeout |= params[1] << 32; + handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r); + break; + } + default: + break; + } + + if (handled) + return kvm_xen_hypercall_set_result(vcpu, r); + vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; @@ -890,14 +1306,28 @@ static inline int max_evtchn_port(struct kvm *kvm) return COMPAT_EVTCHN_2L_NR_CHANNELS; } +static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) +{ + int poll_evtchn = vcpu->arch.xen.poll_evtchn; + + if ((poll_evtchn == port || poll_evtchn == -1) && + test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) { + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); + } +} + /* - * This follows the kvm_set_irq() API, so it returns: + * The return value from this function is propagated to kvm_set_irq() API, + * so it returns: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to + * + * It is also called directly from kvm_arch_set_irq_inatomic(), where the + * only check on its return value is a comparison with -EWOULDBLOCK'. */ -int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct kvm_vcpu *vcpu; @@ -905,23 +1335,29 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, unsigned long flags; int port_word_bit; bool kick_vcpu = false; - int idx; - int rc; + int vcpu_idx, idx, rc; - vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu); - if (!vcpu) - return -1; + vcpu_idx = READ_ONCE(xe->vcpu_idx); + if (vcpu_idx >= 0) + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + else { + vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); + if (!vcpu) + return -EINVAL; + WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu)); + } - if (!vcpu->arch.xen.vcpu_info_set) - return -1; + if (!vcpu->arch.xen.vcpu_info_cache.active) + return -EINVAL; - if (e->xen_evtchn.port >= max_evtchn_port(kvm)) - return -1; + if (xe->port >= max_evtchn_port(kvm)) + return -EINVAL; rc = -EWOULDBLOCK; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); + + read_lock_irqsave(&gpc->lock, flags); if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; @@ -929,12 +1365,12 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, struct shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; - port_word_bit = e->xen_evtchn.port / 64; + port_word_bit = xe->port / 64; } else { struct compat_shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; - port_word_bit = e->xen_evtchn.port / 32; + port_word_bit = xe->port / 32; } /* @@ -944,39 +1380,68 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, * already set, then we kick the vCPU in question to write to the * *real* evtchn_pending_sel in its own guest vcpu_info struct. */ - if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) { + if (test_and_set_bit(xe->port, pending_bits)) { rc = 0; /* It was already raised */ - } else if (test_bit(e->xen_evtchn.port, mask_bits)) { - rc = -1; /* Masked */ + } else if (test_bit(xe->port, mask_bits)) { + rc = -ENOTCONN; /* Masked */ + kvm_xen_check_poller(vcpu, xe->port); } else { - rc = 1; /* Delivered. But was the vCPU waking already? */ - if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) - kick_vcpu = true; + rc = 1; /* Delivered to the bitmap in shared_info. */ + /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ + read_unlock_irqrestore(&gpc->lock, flags); + gpc = &vcpu->arch.xen.vcpu_info_cache; + + read_lock_irqsave(&gpc->lock, flags); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + /* + * Could not access the vcpu_info. Set the bit in-kernel + * and prod the vCPU to deliver it for itself. + */ + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + goto out_rcu; + } + + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct vcpu_info *vcpu_info = gpc->khva; + if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); + kick_vcpu = true; + } + } else { + struct compat_vcpu_info *vcpu_info = gpc->khva; + if (!test_and_set_bit(port_word_bit, + (unsigned long *)&vcpu_info->evtchn_pending_sel)) { + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); + kick_vcpu = true; + } + } + + /* For the per-vCPU lapic vector, deliver it as MSI. */ + if (kick_vcpu && vcpu->arch.xen.upcall_vector) { + kvm_xen_inject_vcpu_vector(vcpu); + kick_vcpu = false; + } } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); if (kick_vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } return rc; } -/* This is the version called from kvm_set_irq() as the .set function */ -static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, - int irq_source_id, int level, bool line_status) +static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) { bool mm_borrowed = false; int rc; - if (!level) - return -1; - - rc = kvm_xen_set_evtchn_fast(e, kvm); + rc = kvm_xen_set_evtchn_fast(xe, kvm); if (rc != -EWOULDBLOCK) return rc; @@ -1020,7 +1485,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; int idx; - rc = kvm_xen_set_evtchn_fast(e, kvm); + rc = kvm_xen_set_evtchn_fast(xe, kvm); if (rc != -EWOULDBLOCK) break; @@ -1037,11 +1502,27 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm return rc; } +/* This is the version called from kvm_set_irq() as the .set function */ +static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + if (!level) + return -EINVAL; + + return kvm_xen_set_evtchn(&e->xen_evtchn, kvm); +} + +/* + * Set up an event channel interrupt from the KVM IRQ routing table. + * Used for e.g. PIRQ from passed through physical devices. + */ int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { + struct kvm_vcpu *vcpu; + if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) return -EINVAL; @@ -1049,10 +1530,328 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) return -EINVAL; + /* + * Xen gives us interesting mappings from vCPU index to APIC ID, + * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs + * to find it. Do that once at setup time, instead of every time. + * But beware that on live update / live migration, the routing + * table might be reinstated before the vCPU threads have finished + * recreating their vCPUs. + */ + vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); + if (vcpu) + e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu); + else + e->xen_evtchn.vcpu_idx = -1; + e->xen_evtchn.port = ue->u.xen_evtchn.port; - e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu; + e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu; e->xen_evtchn.priority = ue->u.xen_evtchn.priority; e->set = evtchn_set_fn; return 0; } + +/* + * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl. + */ +int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe) +{ + struct kvm_xen_evtchn e; + int ret; + + if (!uxe->port || uxe->port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e.port = uxe->port; + e.vcpu_id = uxe->vcpu; + e.vcpu_idx = -1; + e.priority = uxe->priority; + + ret = kvm_xen_set_evtchn(&e, kvm); + + /* + * None of that 'return 1 if it actually got delivered' nonsense. + * We don't care if it was masked (-ENOTCONN) either. + */ + if (ret > 0 || ret == -ENOTCONN) + ret = 0; + + return ret; +} + +/* + * Support for *outbound* event channel events via the EVTCHNOP_send hypercall. + */ +struct evtchnfd { + u32 send_port; + u32 type; + union { + struct kvm_xen_evtchn port; + struct { + u32 port; /* zero */ + struct eventfd_ctx *ctx; + } eventfd; + } deliver; +}; + +/* + * Update target vCPU or priority for a registered sending channel. + */ +static int kvm_xen_eventfd_update(struct kvm *kvm, + struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + struct evtchnfd *evtchnfd; + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + mutex_lock(&kvm->lock); + evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); + mutex_unlock(&kvm->lock); + + if (!evtchnfd) + return -ENOENT; + + /* For an UPDATE, nothing may change except the priority/vcpu */ + if (evtchnfd->type != data->u.evtchn.type) + return -EINVAL; + + /* + * Port cannot change, and if it's zero that was an eventfd + * which can't be changed either. + */ + if (!evtchnfd->deliver.port.port || + evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + mutex_lock(&kvm->lock); + evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; + if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { + evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; + evtchnfd->deliver.port.vcpu_idx = -1; + } + mutex_unlock(&kvm->lock); + return 0; +} + +/* + * Configure the target (eventfd or local port delivery) for sending on + * a given event channel. + */ +static int kvm_xen_eventfd_assign(struct kvm *kvm, + struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + struct eventfd_ctx *eventfd = NULL; + struct evtchnfd *evtchnfd = NULL; + int ret = -EINVAL; + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); + if (!evtchnfd) + return -ENOMEM; + + switch(data->u.evtchn.type) { + case EVTCHNSTAT_ipi: + /* IPI must map back to the same port# */ + if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) + goto out; /* -EINVAL */ + break; + + case EVTCHNSTAT_interdomain: + if (data->u.evtchn.deliver.port.port) { + if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) + goto out; /* -EINVAL */ + } else { + eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto out; + } + } + break; + + case EVTCHNSTAT_virq: + case EVTCHNSTAT_closed: + case EVTCHNSTAT_unbound: + case EVTCHNSTAT_pirq: + default: /* Unknown event channel type */ + goto out; /* -EINVAL */ + } + + evtchnfd->send_port = data->u.evtchn.send_port; + evtchnfd->type = data->u.evtchn.type; + if (eventfd) { + evtchnfd->deliver.eventfd.ctx = eventfd; + } else { + /* We only support 2 level event channels for now */ + if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + goto out; /* -EINVAL; */ + + evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port; + evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; + evtchnfd->deliver.port.vcpu_idx = -1; + evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; + } + + mutex_lock(&kvm->lock); + ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, + GFP_KERNEL); + mutex_unlock(&kvm->lock); + if (ret >= 0) + return 0; + + if (ret == -ENOSPC) + ret = -EEXIST; +out: + if (eventfd) + eventfd_ctx_put(eventfd); + kfree(evtchnfd); + return ret; +} + +static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) +{ + struct evtchnfd *evtchnfd; + + mutex_lock(&kvm->lock); + evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); + mutex_unlock(&kvm->lock); + + if (!evtchnfd) + return -ENOENT; + + if (kvm) + synchronize_srcu(&kvm->srcu); + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + return 0; +} + +static int kvm_xen_eventfd_reset(struct kvm *kvm) +{ + struct evtchnfd *evtchnfd; + int i; + + mutex_lock(&kvm->lock); + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); + synchronize_srcu(&kvm->srcu); + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + } + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET) + return kvm_xen_eventfd_reset(kvm); + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN) + return kvm_xen_eventfd_deassign(kvm, port); + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE) + return kvm_xen_eventfd_update(kvm, data); + if (data->u.evtchn.flags) + return -EINVAL; + + return kvm_xen_eventfd_assign(kvm, data); +} + +static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) +{ + struct evtchnfd *evtchnfd; + struct evtchn_send send; + gpa_t gpa; + int idx; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) { + *r = -EFAULT; + return true; + } + + /* The evtchn_ports idr is protected by vcpu->kvm->srcu */ + evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); + if (!evtchnfd) + return false; + + if (evtchnfd->deliver.port.port) { + int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm); + if (ret < 0 && ret != -ENOTCONN) + return false; + } else { + eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + } + + *r = 0; + return true; +} + +void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; + vcpu->arch.xen.poll_evtchn = 0; + timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); +} + +void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_timer_enabled(vcpu)) + kvm_xen_stop_timer(vcpu); + + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache); + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); + del_timer_sync(&vcpu->arch.xen.poll_timer); +} + +void kvm_xen_init_vm(struct kvm *kvm) +{ + idr_init(&kvm->arch.xen.evtchn_ports); +} + +void kvm_xen_destroy_vm(struct kvm *kvm) +{ + struct evtchnfd *evtchnfd; + int i; + + kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + } + idr_destroy(&kvm->arch.xen.evtchn_ports); + + if (kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index adbcc9ed59db..ee5c4ae0755c 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -15,16 +15,19 @@ extern struct static_key_false_deferred kvm_xen_enabled; int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu); +void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu); int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); - -int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, +void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu); +void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu); +int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm); int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, @@ -46,11 +49,33 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_xen_enabled.key) && - vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector) + vcpu->arch.xen.vcpu_info_cache.active && + vcpu->kvm->arch.xen.upcall_vector) return __kvm_xen_has_interrupt(vcpu); return 0; } + +static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.evtchn_pending_sel; +} + +static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) +{ + return !!vcpu->arch.xen.timer_virq; +} + +static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu)) + return atomic_read(&vcpu->arch.xen.timer_pending); + + return 0; +} + +void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu); #else static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) { @@ -65,6 +90,14 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } +static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) +{ +} + +static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) +{ +} + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return false; @@ -79,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) { return 0; } + +static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu) +{ +} + +static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu) +{ + return false; +} + +static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) +{ +} + +static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} #endif int kvm_xen_hypercall(struct kvm_vcpu *vcpu); |