diff options
Diffstat (limited to 'arch')
53 files changed, 1358 insertions, 412 deletions
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index f0e66577ce05..127e2dd2e21c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,9 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SLEEP \ + KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); @@ -233,8 +235,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu); -void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu); int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); @@ -291,20 +291,12 @@ static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -static inline int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} + +int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); +int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); +int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 5e3c673fa3f4..5db2d4c6a55f 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -203,6 +203,14 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff #define VGIC_LEVEL_INFO_LINE_LEVEL 0 +/* Device Control API on vcpu fd */ +#define KVM_ARM_VCPU_PMU_V3_CTRL 0 +#define KVM_ARM_VCPU_PMU_V3_IRQ 0 +#define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_TIMER_CTRL 1 +#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 +#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 + #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 #define KVM_DEV_ARM_ITS_SAVE_TABLES 1 #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index fa6182a40941..1e0784ebbfd6 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -301,3 +301,54 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { return -EINVAL; } + +int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_set_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_get_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_has_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index f86a9aaef462..54442e375354 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -72,6 +72,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) trace_kvm_wfx(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index 624a510d31df..ebd2dd46adf7 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -237,8 +237,10 @@ void __hyp_text __noreturn __hyp_panic(int cause) vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + __timer_save_state(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); + __banked_restore_state(host_ctxt); __sysreg_restore_state(host_ctxt); } diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 570ed4a9c261..5386528665b5 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -104,7 +104,6 @@ __do_hyp_init: @ - Write permission implies XN: disabled @ - Instruction cache: enabled @ - Data/Unified cache: enabled - @ - Memory alignment checks: enabled @ - MMU: enabled (this code must be run from an identity mapping) mrc p15, 4, r0, c1, c0, 0 @ HSCR ldr r2, =HSCTLR_MASK @@ -112,8 +111,8 @@ __do_hyp_init: mrc p15, 0, r1, c1, c0, 0 @ SCTLR ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) and r1, r1, r2 - ARM( ldr r2, =(HSCTLR_M | HSCTLR_A) ) - THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) + ARM( ldr r2, =(HSCTLR_M) ) + THUMB( ldr r2, =(HSCTLR_M | HSCTLR_TE) ) orr r1, r1, r2 orr r0, r0, r1 mcr p15, 4, r0, c1, c0, 0 @ HSCR diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index 1da8b2d14550..5ed0c3ee33d6 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -37,16 +37,6 @@ static struct kvm_regs cortexa_regs_reset = { .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, }; -static const struct kvm_irq_level cortexa_ptimer_irq = { - { .irq = 30 }, - .level = 1, -}; - -static const struct kvm_irq_level cortexa_vtimer_irq = { - { .irq = 27 }, - .level = 1, -}; - /******************************************************************************* * Exported reset function @@ -62,16 +52,12 @@ static const struct kvm_irq_level cortexa_vtimer_irq = { int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct kvm_regs *reset_regs; - const struct kvm_irq_level *cpu_vtimer_irq; - const struct kvm_irq_level *cpu_ptimer_irq; switch (vcpu->arch.target) { case KVM_ARM_TARGET_CORTEX_A7: case KVM_ARM_TARGET_CORTEX_A15: reset_regs = &cortexa_regs_reset; vcpu->arch.midr = read_cpuid_id(); - cpu_vtimer_irq = &cortexa_vtimer_irq; - cpu_ptimer_irq = &cortexa_ptimer_irq; break; default: return -ENODEV; @@ -84,5 +70,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_reset_coprocs(vcpu); /* Reset arch_timer context */ - return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); + return kvm_timer_vcpu_reset(vcpu); } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3dcd7ec69bca..6252365b0c96 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -480,6 +480,17 @@ config CAVIUM_ERRATUM_27456 If unsure, say Y. +config CAVIUM_ERRATUM_30115 + bool "Cavium erratum 30115: Guest may disable interrupts in host" + default y + help + On ThunderX T88 pass 1.x through 2.2, T81 pass 1.0 through + 1.2, and T83 Pass 1.0, KVM guest execution may disable + interrupts in host. Trapping both GICv3 group-0 and group-1 + accesses sidesteps the issue. + + If unsure, say Y. + config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 1a98bc8602a2..8cef47fa2218 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -89,7 +89,7 @@ static inline void gic_write_ctlr(u32 val) static inline void gic_write_grpen1(u32 val) { - write_sysreg_s(val, SYS_ICC_GRPEN1_EL1); + write_sysreg_s(val, SYS_ICC_IGRPEN1_EL1); isb(); } diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b3aab8a17868..8d2272c6822c 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -38,7 +38,8 @@ #define ARM64_WORKAROUND_REPEAT_TLBI 17 #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 #define ARM64_WORKAROUND_858921 19 +#define ARM64_WORKAROUND_CAVIUM_30115 20 -#define ARM64_NCAPS 20 +#define ARM64_NCAPS 21 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 0984d1b3a8f2..235e77d98261 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -86,6 +86,7 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 #define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 +#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 #define BRCM_CPU_PART_VULCAN 0x516 @@ -96,6 +97,7 @@ #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) +#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 85997c0e5443..e7d8e281ff62 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -19,6 +19,7 @@ #define __ASM_ESR_H #include <asm/memory.h> +#include <asm/sysreg.h> #define ESR_ELx_EC_UNKNOWN (0x00) #define ESR_ELx_EC_WFx (0x01) @@ -181,6 +182,29 @@ #define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \ ESR_ELx_SYS64_ISS_DIR_READ) +#define esr_sys64_to_sysreg(e) \ + sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >> \ + ESR_ELx_SYS64_ISS_OP0_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + +#define esr_cp15_to_sysreg(e) \ + sys_reg(3, \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + #ifndef __ASSEMBLY__ #include <asm/types.h> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1f252a95bc02..d68630007b14 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -42,7 +42,9 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SLEEP \ + KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -334,8 +336,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu); -void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu); u64 __kvm_call_hyp(void *hypfn, ...); #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index b18e852d27e8..4572a9b560fa 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -127,6 +127,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); +int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); void __timer_save_state(struct kvm_vcpu *vcpu); void __timer_restore_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 15c142ce991c..16e44fa9b3b6 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -180,14 +180,31 @@ #define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0) +#define SYS_ICC_IAR0_EL1 sys_reg(3, 0, 12, 8, 0) +#define SYS_ICC_EOIR0_EL1 sys_reg(3, 0, 12, 8, 1) +#define SYS_ICC_HPPIR0_EL1 sys_reg(3, 0, 12, 8, 2) +#define SYS_ICC_BPR0_EL1 sys_reg(3, 0, 12, 8, 3) +#define SYS_ICC_AP0Rn_EL1(n) sys_reg(3, 0, 12, 8, 4 | n) +#define SYS_ICC_AP0R0_EL1 SYS_ICC_AP0Rn_EL1(0) +#define SYS_ICC_AP0R1_EL1 SYS_ICC_AP0Rn_EL1(1) +#define SYS_ICC_AP0R2_EL1 SYS_ICC_AP0Rn_EL1(2) +#define SYS_ICC_AP0R3_EL1 SYS_ICC_AP0Rn_EL1(3) +#define SYS_ICC_AP1Rn_EL1(n) sys_reg(3, 0, 12, 9, n) +#define SYS_ICC_AP1R0_EL1 SYS_ICC_AP1Rn_EL1(0) +#define SYS_ICC_AP1R1_EL1 SYS_ICC_AP1Rn_EL1(1) +#define SYS_ICC_AP1R2_EL1 SYS_ICC_AP1Rn_EL1(2) +#define SYS_ICC_AP1R3_EL1 SYS_ICC_AP1Rn_EL1(3) #define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) +#define SYS_ICC_RPR_EL1 sys_reg(3, 0, 12, 11, 3) #define SYS_ICC_SGI1R_EL1 sys_reg(3, 0, 12, 11, 5) #define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) #define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) +#define SYS_ICC_HPPIR1_EL1 sys_reg(3, 0, 12, 12, 2) #define SYS_ICC_BPR1_EL1 sys_reg(3, 0, 12, 12, 3) #define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) #define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) -#define SYS_ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define SYS_ICC_IGRPEN0_EL1 sys_reg(3, 0, 12, 12, 6) +#define SYS_ICC_IGRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) #define SYS_CONTEXTIDR_EL1 sys_reg(3, 0, 13, 0, 1) #define SYS_TPIDR_EL1 sys_reg(3, 0, 13, 0, 4) @@ -286,6 +303,10 @@ #define SCTLR_ELx_A (1 << 1) #define SCTLR_ELx_M 1 +#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ + (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \ + (1 << 29)) + #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ SCTLR_ELx_SA | SCTLR_ELx_I) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 70eea2ecc663..9f3ca24bbcc6 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -232,6 +232,9 @@ struct kvm_arch_memory_slot { #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_TIMER_CTRL 1 +#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 +#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 2ed2a7657711..0e27f86ee709 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -133,6 +133,27 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), }, #endif +#ifdef CONFIG_CAVIUM_ERRATUM_30115 + { + /* Cavium ThunderX, T88 pass 1.x - 2.2 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 2), + }, + { + /* Cavium ThunderX, T81 pass 1.0 - 1.2 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x02), + }, + { + /* Cavium ThunderX, T83 pass 1.0 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX_83XX, 0x00, 0x00), + }, +#endif { .desc = "Mismatched cache line size", .capability = ARM64_MISMATCHED_CACHE_LINE_SIZE, diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index b37446a8ffdb..5c7f657dd207 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -390,6 +390,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_set_attr(vcpu, attr); + break; default: ret = -ENXIO; break; @@ -407,6 +410,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_get_attr(vcpu, attr); + break; default: ret = -ENXIO; break; @@ -424,6 +430,9 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_has_attr(vcpu, attr); + break; default: ret = -ENXIO; break; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fa1b18e364fc..17d8a1677a0b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -89,6 +89,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 839425c24b1c..3f9615582377 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -106,10 +106,13 @@ __do_hyp_init: tlbi alle2 dsb sy - mrs x4, sctlr_el2 - and x4, x4, #SCTLR_ELx_EE // preserve endianness of EL2 - ldr x5, =SCTLR_ELx_FLAGS - orr x4, x4, x5 + /* + * Preserve all the RES1 bits while setting the default flags, + * as well as the EE bit on BE. Drop the A flag since the compiler + * is allowed to generate unaligned accesses. + */ + ldr x4, =(SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) +CPU_BE( orr x4, x4, #SCTLR_ELx_EE) msr sctlr_el2, x4 isb diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index aede1658aeda..945e79c641c4 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -350,6 +350,20 @@ again: } } + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + exit_code == ARM_EXCEPTION_TRAP && + (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || + kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { + int ret = __vgic_v3_perform_cpuif_access(vcpu); + + if (ret == 1) { + __skip_instr(vcpu); + goto again; + } + + /* 0 falls through to be handled out of EL2 */ + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); @@ -422,6 +436,7 @@ void __hyp_text __noreturn __hyp_panic(void) vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + __timer_save_state(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); __sysreg_restore_host_state(host_ctxt); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 561badf93de8..3256b9228e75 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -46,16 +46,6 @@ static const struct kvm_regs default_regs_reset32 = { COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), }; -static const struct kvm_irq_level default_ptimer_irq = { - .irq = 30, - .level = 1, -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, -}; - static bool cpu_has_32bit_el1(void) { u64 pfr0; @@ -108,8 +98,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { - const struct kvm_irq_level *cpu_vtimer_irq; - const struct kvm_irq_level *cpu_ptimer_irq; const struct kvm_regs *cpu_reset; switch (vcpu->arch.target) { @@ -122,8 +110,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) cpu_reset = &default_regs_reset; } - cpu_vtimer_irq = &default_vtimer_irq; - cpu_ptimer_irq = &default_ptimer_irq; break; } @@ -137,5 +123,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_reset(vcpu); /* Reset timer */ - return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); + return kvm_timer_vcpu_reset(vcpu); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0fe27024a2e1..77862881ae86 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -56,7 +56,8 @@ */ static bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) + struct sys_reg_params *params, + const struct sys_reg_desc *r) { WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); print_sys_reg_instr(params); @@ -64,6 +65,16 @@ static bool read_from_write_only(struct kvm_vcpu *vcpu, return false; } +static bool write_to_read_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n"); + print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; @@ -93,7 +104,7 @@ static bool access_dcsw(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); kvm_set_way_flush(vcpu); return true; @@ -135,7 +146,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); vgic_v3_dispatch_sgi(vcpu, p->regval); @@ -773,7 +784,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return trap_raz_wi(vcpu, p, r); if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); if (pmu_write_swinc_el0_disabled(vcpu)) return false; @@ -953,7 +964,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, + { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 79f37e37d367..116786d2e8e8 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -65,8 +65,8 @@ static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. * The vgic_set_vmcr() will convert to ICH_VMCR layout. */ - vmcr.ctlr = val & ICC_CTLR_EL1_CBPR_MASK; - vmcr.ctlr |= val & ICC_CTLR_EL1_EOImode_MASK; + vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT; + vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT; vgic_set_vmcr(vcpu, &vmcr); } else { val = 0; @@ -83,8 +83,8 @@ static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. * Extract it directly using ICC_CTLR_EL1 reg definitions. */ - val |= vmcr.ctlr & ICC_CTLR_EL1_CBPR_MASK; - val |= vmcr.ctlr & ICC_CTLR_EL1_EOImode_MASK; + val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK; + val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; p->regval = val; } @@ -135,7 +135,7 @@ static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, p->regval = 0; vgic_get_vmcr(vcpu, &vmcr); - if (!((vmcr.ctlr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT)) { + if (!vmcr.cbpr) { if (p->is_write) { vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >> ICC_BPR1_EL1_SHIFT; @@ -268,36 +268,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { - /* ICC_PMR_EL1 */ - { Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr }, - /* ICC_BPR0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 }, - /* ICC_AP0R0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r }, - /* ICC_AP0R1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r }, - /* ICC_AP0R2_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r }, - /* ICC_AP0R3_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r }, - /* ICC_AP1R0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r }, - /* ICC_AP1R1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r }, - /* ICC_AP1R2_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r }, - /* ICC_AP1R3_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r }, - /* ICC_BPR1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 }, - /* ICC_CTLR_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr }, - /* ICC_SRE_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre }, - /* ICC_IGRPEN0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 }, - /* ICC_GRPEN1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 }, + { SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr }, + { SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 }, + { SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr }, + { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 }, }; int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index a563759fd142..6a0d7040d882 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -1094,7 +1094,7 @@ static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, struct mm_struct *mm; int i; - if (likely(!vcpu->requests)) + if (likely(!kvm_request_pending(vcpu))) return; if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 71d8856ade64..74805035edc8 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -2337,7 +2337,7 @@ static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu) int ret = 0; int i; - if (!vcpu->requests) + if (!kvm_request_pending(vcpu)) return 0; if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7d64f99ea3b8..8b3f1238d07f 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -53,8 +53,8 @@ #define KVM_IRQCHIP_NUM_PINS 256 /* PPC-specific vcpu->requests bit members */ -#define KVM_REQ_WATCHDOG 8 -#define KVM_REQ_EPR_EXIT 9 +#define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0) +#define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1) #include <linux/mmu_notifier.h> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index ffe1da95033a..08b200a0bbce 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1257,8 +1257,8 @@ static void xive_pre_save_scan(struct kvmppc_xive *xive) if (!xc) continue; for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { - if (xc->queues[i].qpage) - xive_pre_save_queue(xive, &xc->queues[i]); + if (xc->queues[j].qpage) + xive_pre_save_queue(xive, &xc->queues[j]); } } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3eaac3809977..071b87ee682f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -687,7 +687,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) kvmppc_core_check_exceptions(vcpu); - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { /* Exception delivery raised request; start over */ return 1; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b14736f3870b..1a75c0b5f4ca 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -55,8 +55,7 @@ EXPORT_SYMBOL_GPL(kvmppc_pr_ops); int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !!(v->arch.pending_exceptions) || - v->requests; + return !!(v->arch.pending_exceptions) || kvm_request_pending(v); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -108,7 +107,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) */ smp_mb(); - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { /* Make sure we process requests preemptable */ local_irq_enable(); trace_kvm_check_requests(vcpu); diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index d0441ad2a990..e508dff92535 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -59,7 +59,9 @@ union ctlreg0 { unsigned long lap : 1; /* Low-address-protection control */ unsigned long : 4; unsigned long edat : 1; /* Enhanced-DAT-enablement control */ - unsigned long : 4; + unsigned long : 2; + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 1; unsigned long afp : 1; /* AFP-register control */ unsigned long vx : 1; /* Vector enablement control */ unsigned long : 7; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 426614a882a9..495aedbaf447 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -42,9 +42,11 @@ #define KVM_HALT_POLL_NS_DEFAULT 80000 /* s390-specific vcpu->requests bit members */ -#define KVM_REQ_ENABLE_IBS 8 -#define KVM_REQ_DISABLE_IBS 9 -#define KVM_REQ_ICPT_OPEREXC 10 +#define KVM_REQ_ENABLE_IBS KVM_ARCH_REQ(0) +#define KVM_REQ_DISABLE_IBS KVM_ARCH_REQ(1) +#define KVM_REQ_ICPT_OPEREXC KVM_ARCH_REQ(2) +#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) +#define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -56,7 +58,7 @@ union bsca_sigp_ctrl { __u8 r : 1; __u8 scn : 6; }; -} __packed; +}; union esca_sigp_ctrl { __u16 value; @@ -65,14 +67,14 @@ union esca_sigp_ctrl { __u8 reserved: 7; __u8 scn; }; -} __packed; +}; struct esca_entry { union esca_sigp_ctrl sigp_ctrl; __u16 reserved1[3]; __u64 sda; __u64 reserved2[6]; -} __packed; +}; struct bsca_entry { __u8 reserved0; @@ -80,7 +82,7 @@ struct bsca_entry { __u16 reserved[3]; __u64 sda; __u64 reserved2[2]; -} __attribute__((packed)); +}; union ipte_control { unsigned long val; @@ -97,7 +99,7 @@ struct bsca_block { __u64 mcn; __u64 reserved2; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; -} __attribute__((packed)); +}; struct esca_block { union ipte_control ipte_control; @@ -105,7 +107,21 @@ struct esca_block { __u64 mcn[4]; __u64 reserved2[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; -} __packed; +}; + +/* + * This struct is used to store some machine check info from lowcore + * for machine checks that happen while the guest is running. + * This info in host's lowcore might be overwritten by a second machine + * check from host when host is in the machine check's high-level handling. + * The size is 24 bytes. + */ +struct mcck_volatile_info { + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 reserved; +}; #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 @@ -260,14 +276,15 @@ struct kvm_s390_sie_block { struct kvm_s390_itdb { __u8 data[256]; -} __packed; +}; struct sie_page { struct kvm_s390_sie_block sie_block; - __u8 reserved200[1024]; /* 0x0200 */ + struct mcck_volatile_info mcck_info; /* 0x0200 */ + __u8 reserved218[1000]; /* 0x0218 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ -} __packed; +}; struct kvm_vcpu_stat { u64 exit_userspace; @@ -681,7 +698,7 @@ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ -} __packed; +}; struct kvm_s390_vsie { struct mutex mutex; @@ -691,6 +708,12 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_migration_state { + unsigned long bitmap_size; /* in bits (number of guest pages) */ + atomic64_t dirty_pages; /* number of dirty pages */ + unsigned long *pgste_bitmap; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -718,6 +741,7 @@ struct kvm_arch{ struct kvm_s390_crypto crypto; struct kvm_s390_vsie vsie; u64 epoch; + struct kvm_s390_migration_state *migration_state; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); }; diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index e3e8895f5d3e..9d91cf3e427f 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -14,11 +14,24 @@ #include <linux/const.h> #include <linux/types.h> +#define MCIC_SUBCLASS_MASK (1ULL<<63 | 1ULL<<62 | 1ULL<<61 | \ + 1ULL<<59 | 1ULL<<58 | 1ULL<<56 | \ + 1ULL<<55 | 1ULL<<54 | 1ULL<<53 | \ + 1ULL<<52 | 1ULL<<47 | 1ULL<<46 | \ + 1ULL<<45 | 1ULL<<44) #define MCCK_CODE_SYSTEM_DAMAGE _BITUL(63) +#define MCCK_CODE_EXT_DAMAGE _BITUL(63 - 5) +#define MCCK_CODE_CP _BITUL(63 - 9) #define MCCK_CODE_CPU_TIMER_VALID _BITUL(63 - 46) #define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20) #define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23) +#define MCCK_CR14_CR_PENDING_SUB_MASK (1 << 28) +#define MCCK_CR14_RECOVERY_SUB_MASK (1 << 27) +#define MCCK_CR14_DEGRAD_SUB_MASK (1 << 26) +#define MCCK_CR14_EXT_DAMAGE_SUB_MASK (1 << 25) +#define MCCK_CR14_WARN_SUB_MASK (1 << 24) + #ifndef __ASSEMBLY__ union mci { diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 60d395fdc864..5b1b247dfbca 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -20,6 +20,7 @@ #define CIF_FPU 4 /* restore FPU registers */ #define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */ #define CIF_ENABLED_WAIT 6 /* in enabled wait state */ +#define CIF_MCCK_GUEST 7 /* machine check happening in guest */ #define _CIF_MCCK_PENDING _BITUL(CIF_MCCK_PENDING) #define _CIF_ASCE_PRIMARY _BITUL(CIF_ASCE_PRIMARY) @@ -28,6 +29,7 @@ #define _CIF_FPU _BITUL(CIF_FPU) #define _CIF_IGNORE_IRQ _BITUL(CIF_IGNORE_IRQ) #define _CIF_ENABLED_WAIT _BITUL(CIF_ENABLED_WAIT) +#define _CIF_MCCK_GUEST _BITUL(CIF_MCCK_GUEST) #ifndef __ASSEMBLY__ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 3dd2a1d308dd..69d09c39bbcd 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -28,6 +28,7 @@ #define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 #define KVM_DEV_FLIC_AISM 9 #define KVM_DEV_FLIC_AIRQ_INJECT 10 +#define KVM_DEV_FLIC_AISM_ALL 11 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. @@ -53,6 +54,11 @@ struct kvm_s390_ais_req { __u16 mode; }; +struct kvm_s390_ais_all { + __u8 simm; + __u8 nimm; +}; + #define KVM_S390_IO_ADAPTER_MASK 1 #define KVM_S390_IO_ADAPTER_MAP 2 #define KVM_S390_IO_ADAPTER_UNMAP 3 @@ -70,6 +76,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_TOD 1 #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 +#define KVM_S390_VM_MIGRATION 4 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 @@ -151,6 +158,11 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +/* kvm attributes for migration mode */ +#define KVM_S390_VM_MIGRATION_STOP 0 +#define KVM_S390_VM_MIGRATION_START 1 +#define KVM_S390_VM_MIGRATION_STATUS 2 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 6bb29633e1f1..b65c414b6c0e 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -58,6 +58,9 @@ int main(void) OFFSET(__SF_BACKCHAIN, stack_frame, back_chain); OFFSET(__SF_GPRS, stack_frame, gprs); OFFSET(__SF_EMPTY, stack_frame, empty1); + OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]); + OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]); BLANK(); /* timeval/timezone offsets for use by vdso */ OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index e408d9cc5b96..3b4b158382ea 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -225,6 +225,7 @@ ENTRY(sie64a) jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed +.Lsie_entry: sie 0(%r14) .Lsie_skip: ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE @@ -1104,7 +1105,13 @@ cleanup_critical: .quad .Lsie_done .Lcleanup_sie: - lg %r9,__SF_EMPTY(%r15) # get control block pointer + cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? + je 1f + slg %r9,BASED(.Lsie_crit_mcck_start) + clg %r9,BASED(.Lsie_crit_mcck_length) + jh 1f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +1: lg %r9,__SF_EMPTY(%r15) # get control block pointer ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit @@ -1289,6 +1296,10 @@ cleanup_critical: .quad .Lsie_gmap .Lsie_critical_length: .quad .Lsie_done - .Lsie_gmap +.Lsie_crit_mcck_start: + .quad .Lsie_entry +.Lsie_crit_mcck_length: + .quad .Lsie_skip - .Lsie_entry #endif .section .rodata, "a" diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 985589523970..31d03a84126c 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -25,6 +25,8 @@ #include <asm/crw.h> #include <asm/switch_to.h> #include <asm/ctl_reg.h> +#include <asm/asm-offsets.h> +#include <linux/kvm_host.h> struct mcck_struct { unsigned int kill_task : 1; @@ -274,12 +276,39 @@ static int notrace s390_validate_registers(union mci mci, int umode) return kill_task; } +/* + * Backup the guest's machine check info to its description block + */ +static void notrace s390_backup_mcck_info(struct pt_regs *regs) +{ + struct mcck_volatile_info *mcck_backup; + struct sie_page *sie_page; + + /* r14 contains the sie block, which was set in sie64a */ + struct kvm_s390_sie_block *sie_block = + (struct kvm_s390_sie_block *) regs->gprs[14]; + + if (sie_block == NULL) + /* Something's seriously wrong, stop system. */ + s390_handle_damage(); + + sie_page = container_of(sie_block, struct sie_page, sie_block); + mcck_backup = &sie_page->mcck_info; + mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); + mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; + mcck_backup->failing_storage_address + = S390_lowcore.failing_storage_address; +} + #define MAX_IPD_COUNT 29 #define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ #define ED_STP_ISLAND 6 /* External damage STP island check */ #define ED_STP_SYNC 7 /* External damage STP sync check */ +#define MCCK_CODE_NO_GUEST (MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE) + /* * machine check handler. */ @@ -291,6 +320,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) struct mcck_struct *mcck; unsigned long long tmp; union mci mci; + unsigned long mcck_dam_code; nmi_enter(); inc_irq_stat(NMI_NMI); @@ -301,7 +331,13 @@ void notrace s390_do_machine_check(struct pt_regs *regs) /* System damage -> stopping machine */ s390_handle_damage(); } - if (mci.pd) { + + /* + * Reinject the instruction processing damages' machine checks + * including Delayed Access Exception into the guest + * instead of damaging the host if they happen in the guest. + */ + if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) { if (mci.b) { /* Processing backup -> verify if we can survive this */ u64 z_mcic, o_mcic, t_mcic; @@ -345,6 +381,14 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->mcck_code = mci.val; set_cpu_flag(CIF_MCCK_PENDING); } + + /* + * Backup the machine check's info if it happens when the guest + * is running. + */ + if (test_cpu_flag(CIF_MCCK_GUEST)) + s390_backup_mcck_info(regs); + if (mci.cd) { /* Timing facility damage */ s390_handle_damage(); @@ -358,15 +402,22 @@ void notrace s390_do_machine_check(struct pt_regs *regs) if (mcck->stp_queue) set_cpu_flag(CIF_MCCK_PENDING); } - if (mci.se) - /* Storage error uncorrected */ - s390_handle_damage(); - if (mci.ke) - /* Storage key-error uncorrected */ - s390_handle_damage(); - if (mci.ds && mci.fa) - /* Storage degradation */ - s390_handle_damage(); + + /* + * Reinject storage related machine checks into the guest if they + * happen when the guest is running. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) { + if (mci.se) + /* Storage error uncorrected */ + s390_handle_damage(); + if (mci.ke) + /* Storage key-error uncorrected */ + s390_handle_damage(); + if (mci.ds && mci.fa) + /* Storage degradation */ + s390_handle_damage(); + } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; @@ -377,6 +428,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->warning = 1; set_cpu_flag(CIF_MCCK_PENDING); } + + /* + * If there are only Channel Report Pending and External Damage + * machine checks, they will not be reinjected into the guest + * because they refer to host conditions only. + */ + mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK); + if (test_cpu_flag(CIF_MCCK_GUEST) && + (mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) { + /* Set exit reason code for host's later handling */ + *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; + } + clear_cpu_flag(CIF_MCCK_GUEST); nmi_exit(); } diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9da243d94cc3..17e3a4e71bc9 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -89,7 +89,7 @@ struct region3_table_entry_fc1 { unsigned long f : 1; /* Fetch-Protection Bit */ unsigned long fc : 1; /* Format-Control */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long co : 1; /* Change-Recording Override */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ unsigned long : 2; unsigned long i : 1; /* Region-Invalid Bit */ unsigned long cr : 1; /* Common-Region Bit */ @@ -131,7 +131,7 @@ struct segment_entry_fc1 { unsigned long f : 1; /* Fetch-Protection Bit */ unsigned long fc : 1; /* Format-Control */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long co : 1; /* Change-Recording Override */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ unsigned long : 2; unsigned long i : 1; /* Segment-Invalid Bit */ unsigned long cs : 1; /* Common-Segment Bit */ @@ -168,7 +168,8 @@ union page_table_entry { unsigned long z : 1; /* Zero Bit */ unsigned long i : 1; /* Page-Invalid Bit */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 9; + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 8; }; }; @@ -241,7 +242,7 @@ struct ale { unsigned long asteo : 25; /* ASN-Second-Table-Entry Origin */ unsigned long : 6; unsigned long astesn : 32; /* ASTE Sequence Number */ -} __packed; +}; struct aste { unsigned long i : 1; /* ASX-Invalid Bit */ @@ -257,7 +258,7 @@ struct aste { unsigned long ald : 32; unsigned long astesn : 32; /* .. more fields there */ -} __packed; +}; int ipte_lock_held(struct kvm_vcpu *vcpu) { @@ -485,6 +486,7 @@ enum prot_type { PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, + PROT_TYPE_IEP = 4, }; static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, @@ -500,6 +502,9 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_IEP: + tec->b61 = 1; + /* FALL THROUGH */ case PROT_TYPE_LA: tec->b56 = 1; break; @@ -591,6 +596,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * @gpa: points to where guest physical (absolute) address should be stored * @asce: effective asce * @mode: indicates the access mode to be used + * @prot: returns the type for protection exceptions * * Translate a guest virtual address into a guest absolute address by means * of dynamic address translation as specified by the architecture. @@ -606,19 +612,21 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) */ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, unsigned long *gpa, const union asce asce, - enum gacc_mode mode) + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; union page_table_entry pte; int dat_protection = 0; + int iep_protection = 0; union ctlreg0 ctlreg0; unsigned long ptr; - int edat1, edat2; + int edat1, edat2, iep; ctlreg0.val = vcpu->arch.sie_block->gcr[0]; edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8); edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78); + iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; ptr = asce.origin * 4096; @@ -702,6 +710,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_TRANSLATION_SPEC; if (rtte.fc && edat2) { dat_protection |= rtte.fc1.p; + iep_protection = rtte.fc1.iep; raddr.rfaa = rtte.fc1.rfaa; goto absolute_address; } @@ -729,6 +738,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_TRANSLATION_SPEC; if (ste.fc && edat1) { dat_protection |= ste.fc1.p; + iep_protection = ste.fc1.iep; raddr.sfaa = ste.fc1.sfaa; goto absolute_address; } @@ -745,12 +755,19 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, if (pte.z) return PGM_TRANSLATION_SPEC; dat_protection |= pte.p; + iep_protection = pte.iep; raddr.pfra = pte.pfra; real_address: raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr); absolute_address: - if (mode == GACC_STORE && dat_protection) + if (mode == GACC_STORE && dat_protection) { + *prot = PROT_TYPE_DAT; return PGM_PROTECTION; + } + if (mode == GACC_IFETCH && iep_protection && iep) { + *prot = PROT_TYPE_IEP; + return PGM_PROTECTION; + } if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) return PGM_ADDRESSING; *gpa = raddr.addr; @@ -782,6 +799,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, { psw_t *psw = &vcpu->arch.sie_block->gpsw; int lap_enabled, rc = 0; + enum prot_type prot; lap_enabled = low_address_protection_enabled(vcpu, asce); while (nr_pages) { @@ -791,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, PROT_TYPE_LA); ga &= PAGE_MASK; if (psw_bits(*psw).t) { - rc = guest_translate(vcpu, ga, pages, asce, mode); + rc = guest_translate(vcpu, ga, pages, asce, mode, &prot); if (rc < 0) return rc; } else { @@ -800,7 +818,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, rc = PGM_ADDRESSING; } if (rc) - return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); + return trans_exc(vcpu, rc, ga, ar, mode, prot); ga += PAGE_SIZE; pages++; nr_pages--; @@ -886,6 +904,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long *gpa, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; + enum prot_type prot; union asce asce; int rc; @@ -900,9 +919,9 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, } if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */ - rc = guest_translate(vcpu, gva, gpa, asce, mode); + rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot); if (rc > 0) - return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); + return trans_exc(vcpu, rc, gva, 0, mode, prot); } else { *gpa = kvm_s390_real_to_abs(vcpu, gva); if (kvm_is_error_gpa(vcpu->kvm, *gpa)) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index caf15c8a8948..f2c78fc1852d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -251,8 +251,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* + * Check both floating and local interrupt's cr14 because + * bit IRQ_PEND_MCHK_REP could be set in both cases. + */ if (!(vcpu->arch.sie_block->gcr[14] & - vcpu->kvm->arch.float_int.mchk.cr14)) + (vcpu->kvm->arch.float_int.mchk.cr14 | + vcpu->arch.local_int.irq.mchk.cr14))) __clear_bit(IRQ_PEND_MCHK_REP, &active_mask); /* @@ -1876,6 +1881,28 @@ out: return ret < 0 ? ret : n; } +static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (attr->attr < sizeof(ais)) + return -EINVAL; + + if (!test_kvm_facility(kvm, 72)) + return -ENOTSUPP; + + mutex_lock(&fi->ais_lock); + ais.simm = fi->simm; + ais.nimm = fi->nimm; + mutex_unlock(&fi->ais_lock); + + if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais))) + return -EFAULT; + + return 0; +} + static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r; @@ -1885,6 +1912,9 @@ static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr, attr->attr); break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_get_all(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2235,6 +2265,25 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr) return kvm_s390_inject_airq(kvm, adapter); } +static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (!test_kvm_facility(kvm, 72)) + return -ENOTSUPP; + + if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) + return -EFAULT; + + mutex_lock(&fi->ais_lock); + fi->simm = ais.simm; + fi->nimm = ais.nimm; + mutex_unlock(&fi->ais_lock); + + return 0; +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2277,6 +2326,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_AIRQ_INJECT: r = flic_inject_airq(dev->kvm, attr); break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_set_all(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2298,6 +2350,7 @@ static int flic_has_attr(struct kvm_device *dev, case KVM_DEV_FLIC_CLEAR_IO_IRQ: case KVM_DEV_FLIC_AISM: case KVM_DEV_FLIC_AIRQ_INJECT: + case KVM_DEV_FLIC_AISM_ALL: return 0; } return -ENXIO; @@ -2415,6 +2468,42 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } +/* + * Inject the machine check to the guest. + */ +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info) +{ + struct kvm_s390_interrupt_info inti; + struct kvm_s390_irq irq; + struct kvm_s390_mchk_info *mchk; + union mci mci; + __u64 cr14 = 0; /* upper bits are not used */ + + mci.val = mcck_info->mcic; + if (mci.sr) + cr14 |= MCCK_CR14_RECOVERY_SUB_MASK; + if (mci.dg) + cr14 |= MCCK_CR14_DEGRAD_SUB_MASK; + if (mci.w) + cr14 |= MCCK_CR14_WARN_SUB_MASK; + + mchk = mci.ck ? &inti.mchk : &irq.u.mchk; + mchk->cr14 = cr14; + mchk->mcic = mcck_info->mcic; + mchk->ext_damage_code = mcck_info->ext_damage_code; + mchk->failing_storage_address = mcck_info->failing_storage_address; + if (mci.ck) { + /* Inject the floating machine check */ + inti.type = KVM_S390_MCHK; + WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti)); + } else { + /* Inject the machine check to specified vcpu */ + irq.type = KVM_S390_MCHK; + WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq)); + } +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 689ac48361c6..ef6419654c16 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -30,6 +30,7 @@ #include <linux/vmalloc.h> #include <linux/bitmap.h> #include <linux/sched/signal.h> +#include <linux/string.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> @@ -386,6 +387,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: r = 1; break; @@ -750,6 +752,129 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) +{ + int cx; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(cx, vcpu, kvm) + kvm_s390_sync_request(req, vcpu); +} + +/* + * Must be called with kvm->srcu held to avoid races on memslots, and with + * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + */ +static int kvm_s390_vm_start_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + struct kvm_memory_slot *ms; + /* should be the only one */ + struct kvm_memslots *slots; + unsigned long ram_pages; + int slotnr; + + /* migration mode already enabled */ + if (kvm->arch.migration_state) + return 0; + + slots = kvm_memslots(kvm); + if (!slots || !slots->used_slots) + return -EINVAL; + + mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); + if (!mgs) + return -ENOMEM; + kvm->arch.migration_state = mgs; + + if (kvm->arch.use_cmma) { + /* + * Get the last slot. They should be sorted by base_gfn, so the + * last slot is also the one at the end of the address space. + * We have verified above that at least one slot is present. + */ + ms = slots->memslots + slots->used_slots - 1; + /* round up so we only use full longs */ + ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); + /* allocate enough bytes to store all the bits */ + mgs->pgste_bitmap = vmalloc(ram_pages / 8); + if (!mgs->pgste_bitmap) { + kfree(mgs); + kvm->arch.migration_state = NULL; + return -ENOMEM; + } + + mgs->bitmap_size = ram_pages; + atomic64_set(&mgs->dirty_pages, ram_pages); + /* mark all the pages in active slots as dirty */ + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { + ms = slots->memslots + slotnr; + bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); + } + + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + } + return 0; +} + +/* + * Must be called with kvm->lock to avoid races with ourselves and + * kvm_s390_vm_start_migration. + */ +static int kvm_s390_vm_stop_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + + /* migration mode already disabled */ + if (!kvm->arch.migration_state) + return 0; + mgs = kvm->arch.migration_state; + kvm->arch.migration_state = NULL; + + if (kvm->arch.use_cmma) { + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + vfree(mgs->pgste_bitmap); + } + kfree(mgs); + return 0; +} + +static int kvm_s390_vm_set_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + int idx, res = -ENXIO; + + mutex_lock(&kvm->lock); + switch (attr->attr) { + case KVM_S390_VM_MIGRATION_START: + idx = srcu_read_lock(&kvm->srcu); + res = kvm_s390_vm_start_migration(kvm); + srcu_read_unlock(&kvm->srcu, idx); + break; + case KVM_S390_VM_MIGRATION_STOP: + res = kvm_s390_vm_stop_migration(kvm); + break; + default: + break; + } + mutex_unlock(&kvm->lock); + + return res; +} + +static int kvm_s390_vm_get_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u64 mig = (kvm->arch.migration_state != NULL); + + if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) + return -ENXIO; + + if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig))) + return -EFAULT; + return 0; +} + static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) { u8 gtod_high; @@ -1090,6 +1215,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CRYPTO: ret = kvm_s390_vm_set_crypto(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_set_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1112,6 +1240,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MODEL: ret = kvm_s390_get_cpu_model(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_get_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1179,6 +1310,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) break; } break; + case KVM_S390_VM_MIGRATION: + ret = 0; + break; default: ret = -ENXIO; break; @@ -1286,6 +1420,182 @@ out: return r; } +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* for consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +/* + * This function searches for the next page with dirty CMMA attributes, and + * saves the attributes in the buffer up to either the end of the buffer or + * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; + * no trailing clean bytes are saved. + * In case no dirty bits were found, or if CMMA was not enabled or used, the + * output buffer will indicate 0 as length. + */ +static int kvm_s390_get_cmma_bits(struct kvm *kvm, + struct kvm_s390_cmma_log *args) +{ + struct kvm_s390_migration_state *s = kvm->arch.migration_state; + unsigned long bufsize, hva, pgstev, i, next, cur; + int srcu_idx, peek, r = 0, rr; + u8 *res; + + cur = args->start_gfn; + i = next = pgstev = 0; + + if (unlikely(!kvm->arch.use_cmma)) + return -ENXIO; + /* Invalid/unsupported flags were specified */ + if (args->flags & ~KVM_S390_CMMA_PEEK) + return -EINVAL; + /* Migration mode query, and we are not doing a migration */ + peek = !!(args->flags & KVM_S390_CMMA_PEEK); + if (!peek && !s) + return -EINVAL; + /* CMMA is disabled or was not used, or the buffer has length zero */ + bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!bufsize || !kvm->mm->context.use_cmma) { + memset(args, 0, sizeof(*args)); + return 0; + } + + if (!peek) { + /* We are not peeking, and there are no dirty pages */ + if (!atomic64_read(&s->dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; + } + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, + args->start_gfn); + if (cur >= s->bitmap_size) /* nothing found, loop back */ + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); + if (cur >= s->bitmap_size) { /* again! (very unlikely) */ + memset(args, 0, sizeof(*args)); + return 0; + } + next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); + } + + res = vmalloc(bufsize); + if (!res) + return -ENOMEM; + + args->start_gfn = cur; + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + while (i < bufsize) { + hva = gfn_to_hva(kvm, cur); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + /* decrement only if we actually flipped the bit to 0 */ + if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) + atomic64_dec(&s->dirty_pages); + r = get_pgste(kvm->mm, hva, &pgstev); + if (r < 0) + pgstev = 0; + /* save the value */ + res[i++] = (pgstev >> 24) & 0x3; + /* + * if the next bit is too far away, stop. + * if we reached the previous "next", find the next one + */ + if (!peek) { + if (next > cur + KVM_S390_MAX_BIT_DISTANCE) + break; + if (cur == next) + next = find_next_bit(s->pgste_bitmap, + s->bitmap_size, cur + 1); + /* reached the end of the bitmap or of the buffer, stop */ + if ((next >= s->bitmap_size) || + (next >= args->start_gfn + bufsize)) + break; + } + cur++; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + args->count = i; + args->remaining = s ? atomic64_read(&s->dirty_pages) : 0; + + rr = copy_to_user((void __user *)args->values, res, args->count); + if (rr) + r = -EFAULT; + + vfree(res); + return r; +} + +/* + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.use_cmma flag is set. + */ +static int kvm_s390_set_cmma_bits(struct kvm *kvm, + const struct kvm_s390_cmma_log *args) +{ + unsigned long hva, mask, pgstev, i; + uint8_t *bits; + int srcu_idx, r = 0; + + mask = args->mask; + + if (!kvm->arch.use_cmma) + return -ENXIO; + /* invalid/unsupported flags */ + if (args->flags != 0) + return -EINVAL; + /* Enforce sane limit on memory allocation */ + if (args->count > KVM_S390_CMMA_SIZE_MAX) + return -EINVAL; + /* Nothing to do */ + if (args->count == 0) + return 0; + + bits = vmalloc(sizeof(*bits) * args->count); + if (!bits) + return -ENOMEM; + + r = copy_from_user(bits, (void __user *)args->values, args->count); + if (r) { + r = -EFAULT; + goto out; + } + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + pgstev = bits[i]; + pgstev = pgstev << 24; + mask &= _PGSTE_GPS_USAGE_MASK; + set_pgste_bits(kvm->mm, hva, mask, pgstev); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + + if (!kvm->mm->context.use_cmma) { + down_write(&kvm->mm->mmap_sem); + kvm->mm->context.use_cmma = 1; + up_write(&kvm->mm->mmap_sem); + } +out: + vfree(bits); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1364,6 +1674,29 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_set_skeys(kvm, &args); break; } + case KVM_S390_GET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_get_cmma_bits(kvm, &args); + if (!r) { + r = copy_to_user(argp, &args, sizeof(args)); + if (r) + r = -EFAULT; + } + break; + } + case KVM_S390_SET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_set_cmma_bits(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -1633,6 +1966,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); + if (kvm->arch.migration_state) { + vfree(kvm->arch.migration_state->pgste_bitmap); + kfree(kvm->arch.migration_state); + } KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } @@ -1977,7 +2314,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) if (!vcpu->arch.sie_block->cbrlo) return -ENOMEM; - vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; return 0; } @@ -2069,6 +2405,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) goto out; + BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) goto out_free_cpu; @@ -2440,7 +2777,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { retry: kvm_s390_vcpu_request_handled(vcpu); - if (!vcpu->requests) + if (!kvm_request_pending(vcpu)) return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the @@ -2489,6 +2826,27 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { + /* + * Disable CMMA virtualization; we will emulate the ESSA + * instruction manually, in order to provide additional + * functionalities needed for live migration. + */ + vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA; + goto retry; + } + + if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { + /* + * Re-enable CMMA virtualization if CMMA is available and + * was used. + */ + if ((vcpu->kvm->arch.use_cmma) && + (vcpu->kvm->mm->context.use_cmma)) + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + goto retry; + } + /* nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_UNHALT, vcpu); @@ -2683,6 +3041,9 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) { + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; + VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); @@ -2693,6 +3054,15 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; + if (exit_reason == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(vcpu->arch.sie_block, + struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + if (vcpu->arch.sie_block->icptcode > 0) { int rc = kvm_handle_sie_intercept(vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 55f5c8457d6d..6fedc8bc7a37 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -397,4 +397,6 @@ static inline int kvm_s390_use_sca_entries(void) */ return sclp.has_sigpif; } +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info); #endif diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c03106c428cf..a226c459809b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -24,6 +24,7 @@ #include <asm/ebcdic.h> #include <asm/sysinfo.h> #include <asm/pgtable.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/io.h> @@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return 0; } +static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) +{ + struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; + int r1, r2, nappended, entries; + unsigned long gfn, hva, res, pgstev, ptev; + unsigned long *cbrlo; + + /* + * We don't need to set SD.FPF.SK to 1 here, because if we have a + * machine check here we either handle it or crash + */ + + kvm_s390_get_regs_rre(vcpu, &r1, &r2); + gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; + hva = gfn_to_hva(vcpu->kvm, gfn); + entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; + + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); + if (nappended < 0) { + res = orc ? 0x10 : 0; + vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + return 0; + } + res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; + /* + * Set the block-content state part of the result. 0 means resident, so + * nothing to do if the page is valid. 2 is for preserved pages + * (non-present and non-zero), and 3 for zero pages (non-present and + * zero). + */ + if (ptev & _PAGE_INVALID) { + res |= 2; + if (pgstev & _PGSTE_GPS_ZERO) + res |= 1; + } + vcpu->run->s.regs.gprs[r1] = res; + /* + * It is possible that all the normal 511 slots were full, in which case + * we will now write in the 512th slot, which is reserved for host use. + * In both cases we let the normal essa handling code process all the + * slots, including the reserved one, if needed. + */ + if (nappended > 0) { + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK); + cbrlo[entries] = gfn << PAGE_SHIFT; + } + + if (orc) { + /* increment only if we are really flipping the bit to 1 */ + if (!test_and_set_bit(gfn, ms->pgste_bitmap)) + atomic64_inc(&ms->dirty_pages); + } + + return nappended; +} + static int handle_essa(struct kvm_vcpu *vcpu) { /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; struct gmap *gmap; - int i; + int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); gmap = vcpu->arch.gmap; @@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - - if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6) + /* Check for invalid operation request code */ + orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + if (orc > ESSA_MAX) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* Retry the ESSA instruction */ - kvm_s390_retry_instr(vcpu); + if (likely(!vcpu->kvm->arch.migration_state)) { + /* + * CMMA is enabled in the KVM settings, but is disabled in + * the SIE block and in the mm_context, and we are not doing + * a migration. Enable CMMA in the mm_context. + * Since we need to take a write lock to write to the context + * to avoid races with storage keys handling, we check if the + * value really needs to be written to; if the value is + * already correct, we do nothing and avoid the lock. + */ + if (vcpu->kvm->mm->context.use_cmma == 0) { + down_write(&vcpu->kvm->mm->mmap_sem); + vcpu->kvm->mm->context.use_cmma = 1; + up_write(&vcpu->kvm->mm->mmap_sem); + } + /* + * If we are here, we are supposed to have CMMA enabled in + * the SIE block. Enabling CMMA works on a per-CPU basis, + * while the context use_cmma flag is per process. + * It's possible that the context flag is enabled and the + * SIE flag is not, so we set the flag always; if it was + * already set, nothing changes, otherwise we enable it + * on this CPU too. + */ + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + /* Retry the ESSA instruction */ + kvm_s390_retry_instr(vcpu); + } else { + /* Account for the possible extra cbrl entry */ + i = do_essa(vcpu, orc); + if (i < 0) + return i; + entries += i; + } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); down_read(&gmap->mm->mmap_sem); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4719ecb9ab42..715c19c45d9a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -26,16 +26,21 @@ struct vsie_page { struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* + * the backup info for machine check. ensure it's at + * the same offset as that in struct sie_page! + */ + struct mcck_volatile_info mcck_info; /* 0x0200 */ /* the pinned originial scb */ - struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + struct kvm_s390_sie_block *scb_o; /* 0x0218 */ /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0208 */ + struct gmap *gmap; /* 0x0220 */ /* address of the last reported fault to guest2 */ - unsigned long fault_addr; /* 0x0210 */ - __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + unsigned long fault_addr; /* 0x0228 */ + __u8 reserved[0x0700 - 0x0230]; /* 0x0230 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ -} __packed; +}; /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, @@ -801,6 +806,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; int rc; handle_last_fault(vcpu, vsie_page); @@ -822,6 +829,14 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + if (rc == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(scb_s, struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + if (rc > 0) rc = 0; /* we could still have an icpt */ else if (rc == -EFAULT) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 695605eb1dfb..9be890893885 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,28 +48,31 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS /* x86-specific vcpu->requests bit members */ -#define KVM_REQ_MIGRATE_TIMER 8 -#define KVM_REQ_REPORT_TPR_ACCESS 9 -#define KVM_REQ_TRIPLE_FAULT 10 -#define KVM_REQ_MMU_SYNC 11 -#define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_EVENT 14 -#define KVM_REQ_APF_HALT 15 -#define KVM_REQ_STEAL_UPDATE 16 -#define KVM_REQ_NMI 17 -#define KVM_REQ_PMU 18 -#define KVM_REQ_PMI 19 -#define KVM_REQ_SMI 20 -#define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_HV_CRASH 26 -#define KVM_REQ_IOAPIC_EOI_EXIT 27 -#define KVM_REQ_HV_RESET 28 -#define KVM_REQ_HV_EXIT 29 -#define KVM_REQ_HV_STIMER 30 +#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) +#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) +#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) +#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) +#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_EVENT KVM_ARCH_REQ(6) +#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) +#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) +#define KVM_REQ_NMI KVM_ARCH_REQ(9) +#define KVM_REQ_PMU KVM_ARCH_REQ(10) +#define KVM_REQ_PMI KVM_ARCH_REQ(11) +#define KVM_REQ_SMI KVM_ARCH_REQ(12) +#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) +#define KVM_REQ_MCLOCK_INPROGRESS \ + KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC \ + KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16) +#define KVM_REQ_APIC_PAGE_RELOAD \ + KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18) +#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19) +#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) +#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) +#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 673f9ac50f6d..dbf266b0d14a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -420,6 +420,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a6fd40aade7c..da6728383052 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0816ab2e8adc..4a38b9656391 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) @@ -3941,6 +3941,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) } /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save + * and restore MXCSR. + */ +static size_t __fxstate_size(int nregs) +{ + return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; +} + +static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) +{ + bool cr4_osfxsr; + if (ctxt->mode == X86EMUL_MODE_PROT64) + return __fxstate_size(16); + + cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; + return __fxstate_size(cr4_osfxsr ? 8 : 0); +} + +/* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode @@ -3961,7 +3980,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; - size_t size; int rc; rc = check_fxsr(ctxt); @@ -3977,68 +3995,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) - size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); - else - size = offsetof(struct fxregs_state, xmm_space[0]); - - return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -} - -static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, - struct fxregs_state *new) -{ - int rc = X86EMUL_CONTINUE; - struct fxregs_state old; - - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* - * 64 bit host will restore XMM 8-15, which is not correct on non-64 - * bit guests. Load the current values in order to preserve 64 bit - * XMMs after fxrstor. - */ -#ifdef CONFIG_X86_64 - /* XXX: accessing XMM 8-15 very awkwardly */ - memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); -#endif - - /* - * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but - * does save and restore MXCSR. - */ - if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) - memcpy(new->xmm_space, old.xmm_space, 8 * 16); - - return rc; + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, + fxstate_size(ctxt)); } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; + size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); - if (rc != X86EMUL_CONTINUE) - return rc; + ctxt->ops->get_fpu(ctxt); - if (fx_state.mxcsr >> 16) - return emulate_gp(ctxt, 0); + size = fxstate_size(ctxt); + if (size < __fxstate_size(16)) { + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + if (rc != X86EMUL_CONTINUE) + goto out; + } - ctxt->ops->get_fpu(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + goto out; - if (ctxt->mode < X86EMUL_MODE_PROT64) - rc = fxrstor_fixup(ctxt, &fx_state); + if (fx_state.mxcsr >> 16) { + rc = emulate_gp(ctxt, 0); + goto out; + } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); +out: ctxt->ops->put_fpu(ctxt); return rc; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c329d2894905..2819d4c123eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1495,31 +1495,65 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; + preempt_enable(); } static bool start_hv_timer(struct kvm_lapic *apic) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + int r; - if ((atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); + if (!kvm_x86_ops->set_hv_timer) + return false; + + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return false; - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) - cancel_hv_timer(apic); + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); + if (r < 0) + return false; + + ktimer->hv_timer_in_use = true; + hrtimer_cancel(&ktimer->timer); + + /* + * Also recheck ktimer->pending, in case the sw timer triggered in + * the window. For periodic timer, leave the hv timer running for + * simplicity, and the deadline will be recomputed on the next vmexit. + */ + if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { + if (r) + apic_timer_expired(apic); + return false; } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; + + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + return true; +} + +static void start_sw_timer(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); +} + +static void restart_apic_timer(struct kvm_lapic *apic) +{ + if (!start_hv_timer(apic)) + start_sw_timer(apic); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) @@ -1533,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); - if (!start_hv_timer(apic)) - start_sw_period(apic); + restart_apic_timer(apic); } } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - start_hv_timer(apic); + restart_apic_timer(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1554,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_timer(apic); + if (apic->lapic_timer.hv_timer_in_use) + start_sw_timer(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - if (atomic_read(&apic->lapic_timer.pending)) - return; +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - start_sw_period(apic); - else if (apic_lvtt_tscdeadline(apic)) - start_sw_tscdeadline(apic); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + restart_apic_timer(apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { atomic_set(&apic->lapic_timer.pending, 0); - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - if (set_target_expiration(apic) && - !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_period(apic); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_tscdeadline(apic); - } + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic)) + return; + + restart_apic_timer(apic); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -1811,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1934,7 +1948,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < KVM_APIC_LVT_NUM; i++) kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) + if (kvm_vcpu_is_reset_bsp(vcpu) && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bcbe811f3b97..29caa2c3dff9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 183ddb235fb4..03df7c1da581 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -189,6 +189,7 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; unsigned int3_injected; unsigned long int3_rip; @@ -963,6 +964,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +static void disable_nmi_singlestep(struct vcpu_svm *svm) +{ + svm->nmi_singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { + /* Clear our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; + } +} + /* Note: * This hash table is used to map VM_ID to a struct kvm_arch, * when handling AMD IOMMU GALOG notification to schedule in @@ -1712,11 +1725,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return to_svm(vcpu)->vmcb->save.rflags; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long rflags = svm->vmcb->save.rflags; + + if (svm->nmi_singlestep) { + /* Hide our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + rflags &= ~X86_EFLAGS_RF; + } + return rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (to_svm(vcpu)->nmi_singlestep) + rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + /* * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), @@ -1807,7 +1833,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, * AMD's VMCB does not have an explicit unusable field, so emulate it * for cross vendor migration purposes by "not present" */ - var->unusable = !var->present || (var->type == 0); + var->unusable = !var->present; switch (seg) { case VCPU_SREG_TR: @@ -1840,6 +1866,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + /* This is symmetric with svm_set_segment() */ var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } @@ -1980,18 +2007,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->base = var->base; s->limit = var->limit; s->selector = var->selector; - if (var->unusable) - s->attrib = 0; - else { - s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); - s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; - s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; - s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; - s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; - s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; - s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; - s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; - } + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; /* * This is always accurate, except if SYSRET returned to a segment @@ -2000,7 +2023,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, * would entail passing the CPL to userspace and back. */ if (seg == VCPU_SREG_SS) - svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + /* This is symmetric with svm_get_segment() */ + svm->vmcb->save.cpl = (var->dpl & 3); mark_dirty(svm->vmcb, VMCB_SEG); } @@ -2113,10 +2137,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + disable_nmi_singlestep(svm); } if (svm->vcpu.guest_debug & @@ -2371,8 +2392,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -2382,7 +2403,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 1; } - return 0; + return 0; } static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, @@ -2535,6 +2556,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -2590,8 +2636,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && svm->apf_reason != 0) @@ -4628,10 +4678,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ + if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) + return; /* STGI will cause a vm exit */ + + if (svm->nested.exit_required) + return; /* we're not going to run the guest yet */ + /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) */ + svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } @@ -4772,6 +4829,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + /* + * Disable singlestep if we're injecting an interrupt/exception. + * We don't want our modified rflags to be pushed on the stack where + * we might not be able to easily reset them if we disabled NMI + * singlestep later. + */ + if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { + /* + * Event injection happens before external interrupts cause a + * vmexit and interrupts are disabled here, so smp_send_reschedule + * is enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); + smp_send_reschedule(vcpu->cpu); + } + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 72f78396bc09..92ddea08f999 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2769,7 +2769,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept_ad_bits) { vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -3195,7 +3195,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || !guest_cpuid_has_mpx(vcpu)) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3277,7 +3277,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || !guest_cpuid_has_mpx(vcpu)) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -6547,7 +6550,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -6914,97 +6916,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * This function performs the various checks including - * - if it's 4KB aligned - * - No bits beyond the physical address width are set - * - Returns 0 on success or else 1 - * (Intel SDM Section 30.3) - */ -static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, - gpa_t *vmpointer) +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) { gva_t gva; - gpa_t vmptr; struct x86_exception e; - struct page *page; - struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer, + sizeof(*vmpointer), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } - switch (exit_reason) { - case EXIT_REASON_VMON: - /* - * SDM 3: 24.11.5 - * The first 4 bytes of VMXON region contain the supported - * VMCS revision identifier - * - * Note - IA32_VMX_BASIC[48] will never be 1 - * for the nested case; - * which replaces physical address width with 32 - * - */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - nested_release_page_clean(page); - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - kunmap(page); - nested_release_page_clean(page); - vmx->nested.vmxon_ptr = vmptr; - break; - case EXIT_REASON_VMCLEAR: - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } - break; - case EXIT_REASON_VMPTRLD: - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } - break; - default: - return 1; /* shouldn't happen */ - } - - if (vmpointer) - *vmpointer = vmptr; return 0; } @@ -7066,6 +6992,8 @@ out_msr_bitmap: static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; + gpa_t vmptr; + struct page *page; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -7095,9 +7023,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - + + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; + * which replaces physical address width with 32 + */ + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { + kunmap(page); + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + kunmap(page); + nested_release_page_clean(page); + + vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); if (ret) return ret; @@ -7213,9 +7169,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); + return kvm_skip_emulated_instruction(vcpu); + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); + } + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); @@ -7545,9 +7511,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); + return kvm_skip_emulated_instruction(vcpu); + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); + } + if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; @@ -7677,7 +7653,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) unsigned long type, types; gva_t gva; struct x86_exception e; - int vpid; + struct { + u64 vpid; + u64 gla; + } operand; if (!(vmx->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -7707,17 +7686,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, - sizeof(u32), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } + if (operand.vpid >> 16) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (is_noncanonical_address(operand.gla)) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + /* fall through */ case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!vpid) { + if (!operand.vpid) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return kvm_skip_emulated_instruction(vcpu); @@ -7913,11 +7903,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; - int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_readl(vcpu, reg); + int reg; + unsigned long val; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ + reg = (exit_qualification >> 8) & 15; + val = kvm_register_readl(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & @@ -7972,6 +7964,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, * lmsw can change bits 1..3 of cr0, and only set bit 0 of * cr0. Other attempted changes are ignored, with no exit. */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) return true; @@ -10733,8 +10726,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } - if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10759,8 +10751,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); } /* @@ -11157,7 +11147,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) vmx->hv_deadline_tsc = tscl + delta_tsc; vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; + + return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02363e37d4a6..f2bd155883dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2841,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_target_expiration_tsc(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_restart_hv_timer(vcpu); + /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -6731,7 +6731,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6895,7 +6895,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); } - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -8394,10 +8394,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; - if (atomic_read(&vcpu->arch.nmi_queued)) + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_ops->nmi_allowed(vcpu))) return true; - if (kvm_test_request(KVM_REQ_SMI, vcpu)) + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && !is_smm(vcpu))) return true; if (kvm_arch_interrupt_allowed(vcpu) && |