diff options
120 files changed, 2891 insertions, 1605 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c6212c2d5fe3..a6729c8cf063 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3357,6 +3357,7 @@ flags which can include the following: - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + - KVM_GUESTDBG_BLOCKIRQ: avoid injecting interrupts/NMI/SMI [x86] For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints are enabled in memory so we need to ensure breakpoint exceptions are @@ -5208,6 +5209,9 @@ by a string of size ``name_size``. #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@ -5215,18 +5219,20 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) + #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) #define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT) #define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT) + #define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2 struct kvm_stats_desc { __u32 flags; __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; @@ -5237,21 +5243,35 @@ The following flags are supported: Bits 0-3 of ``flags`` encode the type: * ``KVM_STATS_TYPE_CUMULATIVE`` - The statistics data is cumulative. The value of data can only be increased. + The statistics reports a cumulative count. The value of data can only be increased. Most of the counters used in KVM are of this type. The corresponding ``size`` field for this type is always 1. All cumulative statistics data are read/write. * ``KVM_STATS_TYPE_INSTANT`` - The statistics data is instantaneous. Its value can be increased or + The statistics reports an instantaneous value. Its value can be increased or decreased. This type is usually used as a measurement of some resources, like the number of dirty pages, the number of large pages, etc. All instant statistics are read only. The corresponding ``size`` field for this type is always 1. * ``KVM_STATS_TYPE_PEAK`` - The statistics data is peak. The value of data can only be increased, and - represents a peak value for a measurement, for example the maximum number + The statistics data reports a peak value, for example the maximum number of items in a hash table bucket, the longest time waited and so on. + The value of data can only be increased. The corresponding ``size`` field for this type is always 1. + * ``KVM_STATS_TYPE_LINEAR_HIST`` + The statistic is reported as a linear histogram. The number of + buckets is specified by the ``size`` field. The size of buckets is specified + by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``) + is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last + bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity + value.) The bucket value indicates how many samples fell in the bucket's range. + * ``KVM_STATS_TYPE_LOG_HIST`` + The statistic is reported as a logarithmic histogram. The number of + buckets is specified by the ``size`` field. The range of the first bucket is + [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF). + Otherwise, The Nth bucket (1 < N < ``size``) covers + [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell + in the bucket's range. Bits 4-7 of ``flags`` encode the unit: @@ -5286,9 +5306,9 @@ unsigned 64bit data. The ``offset`` field is the offset from the start of Data Block to the start of the corresponding statistics data. -The ``unused`` field is reserved for future support for other types of -statistics data, like log/linear histogram. Its value is always 0 for the types -defined above. +The ``bucket_size`` field is used as a parameter for histogram statistics data. +It is only used by linear histogram statistics data, specifying the size of a +bucket. The ``name`` field is the name string of the statistics data. The name string starts at the end of ``struct kvm_stats_desc``. The maximum length including diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 88fa495abbac..5d27da356836 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows: can be taken inside a kvm->srcu read-side critical section, while kvm->slots_lock cannot. +- kvm->mn_active_invalidate_count ensures that pairs of + invalidate_range_start() and invalidate_range_end() callbacks + use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock + are taken on the waiting side in install_new_memslots, so MMU notifiers + must not take either kvm->slots_lock or kvm->slots_arch_lock. + On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index cdfa2a242e9f..ef6be92b1921 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); - return val == ID_AA64PFR0_EL1_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); - return val == ID_AA64PFR0_EL0_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_sve(u64 pfr0) @@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: return 48; - case 6: return 52; + case ID_AA64MMFR0_PARANGE_32: return 32; + case ID_AA64MMFR0_PARANGE_36: return 36; + case ID_AA64MMFR0_PARANGE_40: return 40; + case ID_AA64MMFR0_PARANGE_42: return 42; + case ID_AA64MMFR0_PARANGE_44: return 44; + case ID_AA64MMFR0_PARANGE_48: return 48; + case ID_AA64MMFR0_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d436831dd706..327120c0089f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -12,8 +12,13 @@ #include <asm/types.h> /* Hyp Configuration Register (HCR) bits */ + +#define HCR_TID5 (UL(1) << 58) +#define HCR_DCT (UL(1) << 57) #define HCR_ATA_SHIFT 56 #define HCR_ATA (UL(1) << HCR_ATA_SHIFT) +#define HCR_AMVOFFEN (UL(1) << 51) +#define HCR_FIEN (UL(1) << 47) #define HCR_FWB (UL(1) << 46) #define HCR_API (UL(1) << 41) #define HCR_APK (UL(1) << 40) @@ -32,9 +37,9 @@ #define HCR_TVM (UL(1) << 26) #define HCR_TTLB (UL(1) << 25) #define HCR_TPU (UL(1) << 24) -#define HCR_TPC (UL(1) << 23) +#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */ #define HCR_TSW (UL(1) << 22) -#define HCR_TAC (UL(1) << 21) +#define HCR_TACR (UL(1) << 21) #define HCR_TIDCP (UL(1) << 20) #define HCR_TSC (UL(1) << 19) #define HCR_TID3 (UL(1) << 18) @@ -56,12 +61,13 @@ #define HCR_PTW (UL(1) << 2) #define HCR_SWIO (UL(1) << 1) #define HCR_VM (UL(1) << 0) +#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39)) /* * The bits we set in HCR: * TLOR: Trap LORegion register accesses * RW: 64bit by default, can be overridden for 32bit VMs - * TAC: Trap ACTLR + * TACR: Trap ACTLR * TSC: Trap SMC * TSW: Trap cache operations by set/way * TWE: Trap WFE @@ -76,7 +82,7 @@ * PTW: Take a stage2 fault if a stage1 walk steps in device memory */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ - HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) @@ -275,24 +281,40 @@ #define CPTR_EL2_TTA (1 << 20) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) -#define CPTR_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 */ -#define CPTR_EL2_DEFAULT CPTR_EL2_RES1 +#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ +#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 +#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(29, 21) | \ + GENMASK(19, 14) | \ + BIT(11)) /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) -#define MDCR_EL2_TTRF (1 << 19) -#define MDCR_EL2_TPMS (1 << 14) +#define MDCR_EL2_HPMFZS (UL(1) << 36) +#define MDCR_EL2_HPMFZO (UL(1) << 29) +#define MDCR_EL2_MTPME (UL(1) << 28) +#define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HCCD (UL(1) << 23) +#define MDCR_EL2_TTRF (UL(1) << 19) +#define MDCR_EL2_HPMD (UL(1) << 17) +#define MDCR_EL2_TPMS (UL(1) << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) -#define MDCR_EL2_TDRA (1 << 11) -#define MDCR_EL2_TDOSA (1 << 10) -#define MDCR_EL2_TDA (1 << 9) -#define MDCR_EL2_TDE (1 << 8) -#define MDCR_EL2_HPME (1 << 7) -#define MDCR_EL2_TPM (1 << 6) -#define MDCR_EL2_TPMCR (1 << 5) -#define MDCR_EL2_HPMN_MASK (0x1F) +#define MDCR_EL2_TDRA (UL(1) << 11) +#define MDCR_EL2_TDOSA (UL(1) << 10) +#define MDCR_EL2_TDA (UL(1) << 9) +#define MDCR_EL2_TDE (UL(1) << 8) +#define MDCR_EL2_HPME (UL(1) << 7) +#define MDCR_EL2_TPM (UL(1) << 6) +#define MDCR_EL2_TPMCR (UL(1) << 5) +#define MDCR_EL2_HPMN_MASK (UL(0x1F)) +#define MDCR_EL2_RES0 (GENMASK(63, 37) | \ + GENMASK(35, 30) | \ + GENMASK(25, 24) | \ + GENMASK(22, 20) | \ + BIT(18) | \ + GENMASK(16, 15)) /* For compatibility with fault code shared with 32-bit */ #define FSC_FAULT ESR_ELx_FSC_FAULT diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9f0bf2109be7..e86045ac43ba 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,12 +59,11 @@ #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 #define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16 +#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ @@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); -extern u32 __kvm_get_mdcr_el2(void); +extern u64 __kvm_get_mdcr_el2(void); #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..f8be56d5342b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void); -int __attribute_const__ kvm_target_cpu(void); +u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -185,7 +185,6 @@ enum vcpu_sysreg { PMCNTENSET_EL0, /* Count Enable Set Register */ PMINTENSET_EL1, /* Interrupt Enable Set Register */ PMOVSSET_EL0, /* Overflow Flag Status Set Register */ - PMSWINC_EL0, /* Software Increment Register */ PMUSERENR_EL0, /* User Enable Register */ /* Pointer Authentication Registers in a strict increasing order. */ @@ -287,9 +286,13 @@ struct kvm_vcpu_arch { /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; - /* HYP configuration */ + /* Values of trap registers for the guest. */ u64 hcr_el2; - u32 mdcr_el2; + u64 mdcr_el2; + u64 cptr_el2; + + /* Values of trap registers for the host before guest entry. */ + u64 mdcr_el2_host; /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -576,6 +579,7 @@ struct kvm_vcpu_stat { u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; + u64 signal_exits; u64 exits; }; @@ -771,6 +775,11 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +static inline bool kvm_vm_is_protected(struct kvm *kvm) +{ + return false; +} + int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 9d60b3006efc..657d0c94cf82 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr); #ifndef __KVM_NVHE_HYPERVISOR__ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); -void deactivate_traps_vhe_put(void); +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu); #endif u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b52c5c4b9a3d..02d378887743 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -252,6 +252,11 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * When this is (directly or indirectly) used on the TLB invalidation + * path, we rely on a previously issued DSB so that page table updates + * and VMID reads are correctly ordered. + */ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; @@ -259,7 +264,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; - vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT; return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } @@ -267,9 +272,10 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ -static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr) +static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, + struct kvm_arch *arch) { - write_sysreg(vtcr, vtcr_el2); + write_sysreg(arch->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* @@ -280,11 +286,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) -{ - __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr); -} - static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index f004c0115d89..027783829584 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +static inline bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static inline u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static inline u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static inline u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static inline bool kvm_level_supports_block_mapping(u32 level) +{ + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. + */ + return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. @@ -76,30 +116,15 @@ enum kvm_pgtable_stage2_flags { }; /** - * struct kvm_pgtable - KVM page-table. - * @ia_bits: Maximum input address size, in bits. - * @start_level: Level at which the page-table walk starts. - * @pgd: Pointer to the first top-level entry of the page-table. - * @mm_ops: Memory management callbacks. - * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. - */ -struct kvm_pgtable { - u32 ia_bits; - u32 start_level; - kvm_pte_t *pgd; - struct kvm_pgtable_mm_ops *mm_ops; - - /* Stage-2 only */ - struct kvm_s2_mmu *mmu; - enum kvm_pgtable_stage2_flags flags; -}; - -/** * enum kvm_pgtable_prot - Page-table permissions and attributes. * @KVM_PGTABLE_PROT_X: Execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_SW0: Software bit 0. + * @KVM_PGTABLE_PROT_SW1: Software bit 1. + * @KVM_PGTABLE_PROT_SW2: Software bit 2. + * @KVM_PGTABLE_PROT_SW3: Software bit 3. */ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_X = BIT(0), @@ -107,21 +132,48 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + + KVM_PGTABLE_PROT_SW0 = BIT(55), + KVM_PGTABLE_PROT_SW1 = BIT(56), + KVM_PGTABLE_PROT_SW2 = BIT(57), + KVM_PGTABLE_PROT_SW3 = BIT(58), }; -#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X) + +#define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX +#define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW + +#define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) +typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, + enum kvm_pgtable_prot prot); + /** - * struct kvm_mem_range - Range of Intermediate Physical Addresses - * @start: Start of the range. - * @end: End of the range. + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mm_ops: Memory management callbacks. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + * @flags: Stage-2 page-table flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. */ -struct kvm_mem_range { - u64 start; - u64 end; +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pte_t *pgd; + struct kvm_pgtable_mm_ops *mm_ops; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; + enum kvm_pgtable_stage2_flags flags; + kvm_pgtable_force_pte_cb_t force_pte_cb; }; /** @@ -216,21 +268,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** - * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table. + * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @arch: Arch-specific KVM structure representing the guest virtual * machine. * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags); +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb); #define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ - kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0) + __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. @@ -374,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); * If there is a valid, leaf page-table entry used to translate @addr, then * relax the permissions in that entry according to the read, write and * execute permissions specified by @prot. No permissions are removed, and - * TLB invalidation is performed after updating the entry. + * TLB invalidation is performed after updating the entry. Software bits cannot + * be set or cleared using kvm_pgtable_stage2_relax_perms(). * * Return: 0 on success, negative error code on failure. */ @@ -433,22 +489,42 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); /** - * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical - * Addresses with compatible permission - * attributes. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * @addr: Address that must be covered by the range. - * @prot: Protection attributes that the range must be compatible with. - * @range: Range structure used to limit the search space at call time and - * that will hold the result. + * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry + * with its level. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init() + * or a similar initialiser. + * @addr: Input address for the start of the walk. + * @ptep: Pointer to storage for the retrieved PTE. + * @level: Pointer to storage for the level of the retrieved PTE. + * + * The offset of @addr within a page is ignored. * - * The offset of @addr within a page is ignored. An IPA is compatible with @prot - * iff its corresponding stage-2 page-table entry has default ownership and, if - * valid, is mapped with protection attributes identical to @prot. + * The walker will walk the page-table entries corresponding to the input + * address specified, retrieving the leaf corresponding to this address. + * Invalid entries are treated as leaf entries. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range); +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level); + +/** + * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a + * stage-2 Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte); + +/** + * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1 + * Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte); #endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index f2e06e7c0a31..b268082d67ed 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -784,14 +784,13 @@ #define ID_AA64PFR0_AMU 0x1 #define ID_AA64PFR0_SVE 0x1 #define ID_AA64PFR0_RAS_V1 0x1 +#define ID_AA64PFR0_RAS_V1P1 0x2 #define ID_AA64PFR0_FP_NI 0xf #define ID_AA64PFR0_FP_SUPPORTED 0x0 #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 -#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 -#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 +#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1 +#define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ #define ID_AA64PFR1_MPAMFRAC_SHIFT 16 @@ -847,6 +846,9 @@ #define ID_AA64MMFR0_ASID_SHIFT 4 #define ID_AA64MMFR0_PARANGE_SHIFT 0 +#define ID_AA64MMFR0_ASID_8 0x0 +#define ID_AA64MMFR0_ASID_16 0x2 + #define ID_AA64MMFR0_TGRAN4_NI 0xf #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX 0x7 @@ -857,9 +859,16 @@ #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN 0x1 #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX 0xf +#define ID_AA64MMFR0_PARANGE_32 0x0 +#define ID_AA64MMFR0_PARANGE_36 0x1 +#define ID_AA64MMFR0_PARANGE_40 0x2 +#define ID_AA64MMFR0_PARANGE_42 0x3 +#define ID_AA64MMFR0_PARANGE_44 0x4 #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ARM64_MIN_PARANGE_BITS 32 + #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 @@ -904,6 +913,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_MTPMU_SHIFT 48 #define ID_AA64DFR0_TRBE_SHIFT 44 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 @@ -1034,14 +1044,17 @@ #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT #elif defined(CONFIG_ARM64_64K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT #endif #define MVFR2_FPMISC_SHIFT 4 @@ -1172,6 +1185,11 @@ #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ARM64_FEATURE_FIELD_BITS 4 + +/* Create a mask for the feature bits of the specified feature. */ +#define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT)) + #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b2770d753ba3..f8a3067d10c6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -240,8 +240,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), ARM64_FTR_END, }; @@ -1983,7 +1983,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, #ifdef CONFIG_KVM { @@ -1994,7 +1994,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL1_SHIFT, - .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, { .desc = "Protected KVM", diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 709d2c433c5e..f6b1a88245db 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -181,6 +181,8 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) + HYPERVISOR_DATA_SECTIONS + idmap_pg_dir = .; . += IDMAP_DIR_SIZE; idmap_pg_end = .; @@ -260,8 +262,6 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) - HYPERVISOR_DATA_SECTIONS - /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a4eba0908bfa..d7eec0b43744 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ menuconfig KVM select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_XFER_TO_GUEST_WORK select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD @@ -46,6 +47,15 @@ if KVM source "virt/kvm/Kconfig" +config NVHE_EL2_DEBUG + bool "Debug mode for non-VHE EL2 object" + help + Say Y here to enable the debug mode for the non-VHE KVM EL2 object. + Failure reports will BUG() in the hypervisor. This is intended for + local EL2 hypervisor development. + + If unsure, say N. + endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0ca72f5cda41..fe102cd2e518 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,6 +6,7 @@ #include <linux/bug.h> #include <linux/cpu_pm.h> +#include <linux/entry-kvm.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> @@ -15,6 +16,7 @@ #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> +#include <linux/kmemleak.h> #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> @@ -42,10 +44,6 @@ #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> -#ifdef REQUIRES_VIRT -__asm__(".arch_extension virt"); -#endif - static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); @@ -575,7 +573,7 @@ static void update_vmid(struct kvm_vmid *vmid) kvm_call_hyp(__kvm_flush_vm_context); } - vmid->vmid = kvm_next_vmid; + WRITE_ONCE(vmid->vmid, kvm_next_vmid); kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; @@ -719,6 +717,45 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) } /** + * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest + * @vcpu: The VCPU pointer + * @ret: Pointer to write optional return code + * + * Returns: true if the VCPU needs to return to a preemptible + interruptible + * and skip guest entry. + * + * This function disambiguates between two different types of exits: exits to a + * preemptible + interruptible kernel context and exits to userspace. For an + * exit to userspace, this function will write the return code to ret and return + * true. For an exit to preemptible + interruptible kernel context (i.e. check + * for pending work and re-enter), return true without writing to ret. + */ +static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) +{ + struct kvm_run *run = vcpu->run; + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + *ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + return true; + } + } + + return kvm_request_pending(vcpu) || + need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || + xfer_to_guest_mode_work_pending(); +} + +/** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer * @@ -761,7 +798,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - cond_resched(); + ret = xfer_to_guest_mode_handle_work(vcpu); + if (!ret) + ret = 1; update_vmid(&vcpu->arch.hw_mmu->vmid); @@ -781,30 +820,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vgic_flush_hwstate(vcpu); /* - * Exit if we have a signal pending so that we can deliver the - * signal to user space. - */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - - /* - * If we're using a userspace irqchip, then check if we need - * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - } - - /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and @@ -812,8 +827,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || - kvm_request_pending(vcpu)) { + if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ kvm_pmu_sync_hwstate(vcpu); @@ -1039,7 +1053,7 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned int i, ret; - int phys_target = kvm_target_cpu(); + u32 phys_target = kvm_target_cpu(); if (init->target != phys_target) return -EINVAL; @@ -1108,6 +1122,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); + vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; /* * Handle the "start in power-off" case. @@ -1219,6 +1234,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(®, argp, sizeof(reg))) break; + /* + * We could owe a reset due to PSCI. Handle the pending reset + * here to ensure userspace register accesses are ordered after + * the reset. + */ + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + if (ioctl == KVM_SET_ONE_REG) r = kvm_arm_set_reg(vcpu, ®); else @@ -1700,11 +1723,6 @@ static bool init_psci_relay(void) return true; } -static int init_common_resources(void) -{ - return kvm_set_ipa_limit(); -} - static int init_subsystems(void) { int err = 0; @@ -1958,56 +1976,17 @@ static void _kvm_host_prot_finalize(void *discard) WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); } -static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end); -} - -#define pkvm_mark_hyp_section(__section) \ - pkvm_mark_hyp(__pa_symbol(__section##_start), \ - __pa_symbol(__section##_end)) - static int finalize_hyp_mode(void) { - int cpu, ret; - if (!is_protected_kvm_enabled()) return 0; - ret = pkvm_mark_hyp_section(__hyp_idmap_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_rodata); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_bss); - if (ret) - return ret; - - ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size); - if (ret) - return ret; - - for_each_possible_cpu(cpu) { - phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]); - phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order()); - - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - - start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu)); - end = start + PAGE_SIZE; - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - } + /* + * Exclude HYP BSS from kmemleak so that it doesn't get peeked + * at, which would end badly once the section is inaccessible. + * None of other sections should ever be introspected. + */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); /* * Flip the static key upfront as that may no longer be possible @@ -2019,11 +1998,6 @@ static int finalize_hyp_mode(void) return 0; } -static void check_kvm_target_cpu(void *ret) -{ - *(int *)ret = kvm_target_cpu(); -} - struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu; @@ -2083,7 +2057,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) int kvm_arch_init(void *opaque) { int err; - int ret, cpu; bool in_hyp_mode; if (!is_hyp_mode_available()) { @@ -2098,15 +2071,7 @@ int kvm_arch_init(void *opaque) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ "Only trusted guests should be used on this system.\n"); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); - if (ret < 0) { - kvm_err("Error, CPU %d not supported!\n", cpu); - return -ENODEV; - } - } - - err = init_common_resources(); + err = kvm_set_ipa_limit(); if (err) return err; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d5e79d7ee6e9..db9361338b2a 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -21,7 +21,7 @@ DBG_MDSCR_KDE | \ DBG_MDSCR_MDE) -static DEFINE_PER_CPU(u32, mdcr_el2); +static DEFINE_PER_CPU(u64, mdcr_el2); /** * save/restore_guest_debug_regs diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..5ce26bedf23c 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -31,8 +31,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -50,10 +48,9 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -842,7 +839,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, return 0; } -int __attribute_const__ kvm_target_cpu(void) +u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); @@ -874,7 +871,7 @@ int __attribute_const__ kvm_target_cpu(void) int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { - int target = kvm_target_cpu(); + u32 target = kvm_target_cpu(); if (target < 0) return -ENODEV; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 6f48336b1d86..275a27368a04 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -113,34 +113,20 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. * - * @return: 0 (while setting vcpu->run->exit_reason), -1 for error + * @return: 0 (while setting vcpu->run->exit_reason) */ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 esr = kvm_vcpu_get_esr(vcpu); - int ret = 0; run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = esr; - switch (ESR_ELx_EC(esr)) { - case ESR_ELx_EC_WATCHPT_LOW: + if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) run->debug.arch.far = vcpu->arch.fault.far_el2; - fallthrough; - case ESR_ELx_EC_SOFTSTP_LOW: - case ESR_ELx_EC_BREAKPT_LOW: - case ESR_ELx_EC_BKPT32: - case ESR_ELx_EC_BRK64: - break; - default: - kvm_err("%s: un-handled case esr: %#08x\n", - __func__, (unsigned int) esr); - ret = -1; - break; - } - return ret; + return 0; } static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) @@ -292,11 +278,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } -void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, +void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, + u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, u64 far, u64 hpfar) { - u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr)); - u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr; + u64 elr_in_kimg = __phys_to_kimg(elr_phys); + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; /* @@ -309,20 +296,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { - struct bug_entry *bug = find_bug(elr_in_kimg); const char *file = NULL; unsigned int line = 0; /* All hyp bugs, including warnings, are treated as fatal. */ - if (bug) - bug_get_file_line(bug, &file, &line); + if (!is_protected_kvm_enabled() || + IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + struct bug_entry *bug = find_bug(elr_in_kimg); + + if (bug) + bug_get_file_line(bug, &file, &line); + } if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset); } else { - kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset); } /* @@ -334,5 +325,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", - spsr, elr, esr, far, hpfar, par, vcpu); + spsr, elr_virt, esr, far, hpfar, par, vcpu); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e4a2f295a394..a0e78a6027be 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); } + + vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } -static inline void __deactivate_traps_common(void) +static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); + write_sysreg(0, hstr_el2); if (kvm_arm_support_pmu_v3()) write_sysreg(0, pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 9c227d87c36d..b58c910babaf 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -12,6 +12,32 @@ #include <asm/virt.h> #include <nvhe/spinlock.h> +/* + * SW bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, + PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, +}; + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return prot & PKVM_PAGE_STATE_PROT_MASK; +} + struct host_kvm { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -20,16 +46,21 @@ struct host_kvm { }; extern struct host_kvm host_kvm; +extern const u8 pkvm_hyp_id; + int __pkvm_prot_finalize(void); -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); +int __pkvm_host_share_hyp(u64 pfn); +bool addr_is_memory(phys_addr_t phys); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 8ec3a5a7744b..c9a8f535212e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -23,8 +23,7 @@ int hyp_map_vectors(void); int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot); +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 76b537f8d1c6..4652fd04bdbe 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -15,6 +15,7 @@ #include <asm/alternative.h> #include <asm/lse.h> +#include <asm/rwonce.h> typedef union hyp_spinlock { u32 __val; @@ -89,4 +90,28 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock) : "memory"); } +static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) +{ + hyp_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) +{ + /* + * The __pkvm_init() path accesses protected data-structures without + * holding locks as the other CPUs are guaranteed to not enter EL2 + * concurrently at this point in time. The point by which EL2 is + * initialized on all CPUs is reflected in the pkvm static key, so + * wait until it is set before checking the lock state. + */ + if (static_branch_likely(&kvm_protected_mode_initialized)) + BUG_ON(!hyp_spin_is_locked(lock)); +} +#else +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } +#endif + #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 7d3f25868cae..df361d839902 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 2b23400e0fb3..4b652ffb591d 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -7,6 +7,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> @@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 +#ifdef CONFIG_NVHE_EL2_DEBUG + /* Ensure host stage-2 is disabled */ + mrs x0, hcr_el2 + bic x0, x0, #HCR_VM + msr hcr_el2, x0 + isb + tlbi vmalls12e1 + dsb nsh +#endif + /* Load the panic arguments into x0-7 */ mrs x0, esr_el2 - get_vcpu_ptr x4, x5 - mrs x5, far_el2 - mrs x6, hpfar_el2 - mov x7, xzr // Unused argument + mov x4, x3 + mov x3, x2 + hyp_pa x3, x6 + get_vcpu_ptr x5, x6 + mrs x6, far_el2 + mrs x7, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ cbz x29, __host_enter_without_restoring diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1632f001f4ed..2da6aa8da868 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); } -static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(unsigned long, start, host_ctxt, 1); - DECLARE_REG(unsigned long, size, host_ctxt, 2); - DECLARE_REG(unsigned long, phys, host_ctxt, 3); - DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + DECLARE_REG(u64, pfn, host_ctxt, 1); - cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot); + cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); } static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) @@ -163,14 +160,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } - -static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(phys_addr_t, start, host_ctxt, 1); - DECLARE_REG(phys_addr_t, end, host_ctxt, 2); - - cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end); -} typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -193,10 +182,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_create_mappings), + HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_prot_finalize), - HANDLE_FUNC(__pkvm_mark_hyp), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a6ce991b1467..bacd493a4eac 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool; u64 id_aa64mmfr0_el1_sys_val; u64 id_aa64mmfr1_el1_sys_val; -static const u8 pkvm_hyp_id = 1; +const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) { @@ -89,6 +89,8 @@ static void prepare_host_vtcr(void) id_aa64mmfr1_el1_sys_val, phys_shift); } +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); + int kvm_host_prepare_stage2(void *pgt_pool_base) { struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; @@ -101,16 +103,17 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) if (ret) return ret; - ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS); + ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch, + &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + host_stage2_force_pte_cb); if (ret) return ret; mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); mmu->arch = &host_kvm.arch; mmu->pgt = &host_kvm.pgt; - mmu->vmid.vmid_gen = 0; - mmu->vmid.vmid = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); + WRITE_ONCE(mmu->vmid.vmid, 0); return 0; } @@ -126,7 +129,7 @@ int __pkvm_prot_finalize(void) kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -159,6 +162,11 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +struct kvm_mem_range { + u64 start; + u64 end; +}; + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; @@ -189,16 +197,26 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) return false; } +bool addr_is_memory(phys_addr_t phys) +{ + struct kvm_mem_range range; + + return find_mem_range(phys, &range); +} + +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + static bool range_is_memory(u64 start, u64 end) { - struct kvm_mem_range r1, r2; + struct kvm_mem_range r; - if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2)) - return false; - if (r1.start != r2.start) + if (!find_mem_range(start, &r)) return false; - return true; + return is_in_mem_range(end - 1, &r); } static inline int __host_stage2_idmap(u64 start, u64 end, @@ -208,60 +226,208 @@ static inline int __host_stage2_idmap(u64 start, u64 end, prot, &host_s2_pool); } +/* + * The pool has been provided with enough pages to cover all of memory with + * page granularity, but it is difficult to know how much of the MMIO range + * we will need to cover upfront, so we may need to 'recycle' the pages if we + * run out. + */ +#define host_stage2_try(fn, ...) \ + ({ \ + int __ret; \ + hyp_assert_lock_held(&host_kvm.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ + if (!__ret) \ + __ret = fn(__VA_ARGS__); \ + } \ + __ret; \ + }) + +static inline bool range_included(struct kvm_mem_range *child, + struct kvm_mem_range *parent) +{ + return parent->start <= child->start && child->end <= parent->end; +} + +static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) +{ + struct kvm_mem_range cur; + kvm_pte_t pte; + u32 level; + int ret; + + hyp_assert_lock_held(&host_kvm.lock); + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + if (ret) + return ret; + + if (kvm_pte_valid(pte)) + return -EAGAIN; + + if (pte) + return -EPERM; + + do { + u64 granule = kvm_granule_size(level); + cur.start = ALIGN_DOWN(addr, granule); + cur.end = cur.start + granule; + level++; + } while ((level < KVM_PGTABLE_MAX_LEVELS) && + !(kvm_level_supports_block_mapping(level) && + range_included(&cur, range))); + + *range = cur; + + return 0; +} + +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, + enum kvm_pgtable_prot prot) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + addr, size, &host_s2_pool, owner_id); +} + +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + /* + * Block mappings must be used with care in the host stage-2 as a + * kvm_pgtable_stage2_map() operation targeting a page in the range of + * an existing block will delete the block under the assumption that + * mappings in the rest of the block range can always be rebuilt lazily. + * That assumption is correct for the host stage-2 with RWX mappings + * targeting memory or RW mappings targeting MMIO ranges (see + * host_stage2_idmap() below which implements some of the host memory + * abort logic). However, this is not safe for any other mappings where + * the host stage-2 page-table is in fact the only place where this + * state is stored. In all those cases, it is safer to use page-level + * mappings, hence avoiding to lose the state because of side-effects in + * kvm_pgtable_stage2_map(). + */ + if (range_is_memory(addr, end)) + return prot != PKVM_HOST_MEM_PROT; + else + return prot != PKVM_HOST_MMIO_PROT; +} + static int host_stage2_idmap(u64 addr) { - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; struct kvm_mem_range range; bool is_memory = find_mem_range(addr, &range); + enum kvm_pgtable_prot prot; int ret; - if (is_memory) - prot |= KVM_PGTABLE_PROT_X; + prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range); + ret = host_stage2_adjust_range(addr, &range); if (ret) goto unlock; - ret = __host_stage2_idmap(range.start, range.end, prot); - if (ret != -ENOMEM) + ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); +unlock: + hyp_spin_unlock(&host_kvm.lock); + + return ret; +} + +static inline bool check_prot(enum kvm_pgtable_prot prot, + enum kvm_pgtable_prot required, + enum kvm_pgtable_prot denied) +{ + return (prot & (required | denied)) == required; +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + phys_addr_t addr = hyp_pfn_to_phys(pfn); + enum kvm_pgtable_prot prot, cur; + void *virt = __hyp_va(addr); + enum pkvm_page_state state; + kvm_pte_t pte; + int ret; + + if (!addr_is_memory(addr)) + return -EINVAL; + + hyp_spin_lock(&host_kvm.lock); + hyp_spin_lock(&pkvm_pgd_lock); + + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL); + if (ret) goto unlock; + if (!pte) + goto map_shared; /* - * The pool has been provided with enough pages to cover all of memory - * with page granularity, but it is difficult to know how much of the - * MMIO range we will need to cover upfront, so we may need to 'recycle' - * the pages if we run out. + * Check attributes in the host stage-2 PTE. We need the page to be: + * - mapped RWX as we're sharing memory; + * - not borrowed, as that implies absence of ownership. + * Otherwise, we can't let it got through */ - ret = host_stage2_unmap_dev_all(); - if (ret) + cur = kvm_pgtable_stage2_pte_prot(pte); + prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) { + ret = -EPERM; goto unlock; + } - ret = __host_stage2_idmap(range.start, range.end, prot); + state = pkvm_getstate(cur); + if (state == PKVM_PAGE_OWNED) + goto map_shared; -unlock: - hyp_spin_unlock(&host_kvm.lock); + /* + * Tolerate double-sharing the same page, but this requires + * cross-checking the hypervisor stage-1. + */ + if (state != PKVM_PAGE_SHARED_OWNED) { + ret = -EPERM; + goto unlock; + } - return ret; -} + ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL); + if (ret) + goto unlock; -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - int ret; + /* + * If the page has been shared with the hypervisor, it must be + * already mapped as SHARED_BORROWED in its stage-1. + */ + cur = kvm_pgtable_hyp_pte_prot(pte); + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, prot, ~prot)) + ret = -EPERM; + goto unlock; +map_shared: /* - * host_stage2_unmap_dev_all() currently relies on MMIO mappings being - * non-persistent, so don't allow changing page ownership in MMIO range. + * If the page is not yet shared, adjust mappings in both page-tables + * while both locks are held. */ - if (!range_is_memory(start, end)) - return -EINVAL; + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot); + BUG_ON(ret); - hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start, - &host_s2_pool, pkvm_hyp_id); + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot); + BUG_ON(ret); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); hyp_spin_unlock(&host_kvm.lock); - return ret != -EAGAIN ? ret : 0; + return ret; } void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index a8efdf0f9003..2fabeceb889a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -23,8 +23,8 @@ u64 __io_map_base; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot) +static int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { int err; @@ -67,13 +67,15 @@ out: return addr; } -int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) { unsigned long start = (unsigned long)from; unsigned long end = (unsigned long)to; unsigned long virt_addr; phys_addr_t phys; + hyp_assert_lock_held(&pkvm_pgd_lock); + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) int err; phys = hyp_virt_to_phys((void *)virt_addr); - err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE, + phys, prot); if (err) return err; } @@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +} + int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) { unsigned long start, end; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0b574d106519..57c27846320f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; + enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP); if (ret) return ret; @@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + /* + * Map the host's .bss and .rodata sections RO in the hypervisor, but + * transfer the ownership from the host to the hypervisor itself to + * make sure it can't be donated or shared with another entity. + * + * The ownership transition requires matching changes in the host + * stage-2. This will be done later (see finalize_host_mappings()) once + * the hyp_vmemmap is addressable. + */ + prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); + ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); + if (ret) + return ret; + return 0; } @@ -148,6 +159,57 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } +static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + enum kvm_pgtable_prot prot; + enum pkvm_page_state state; + kvm_pte_t pte = *ptep; + phys_addr_t phys; + + if (!kvm_pte_valid(pte)) + return 0; + + if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_memory(phys)) + return 0; + + /* + * Adjust the host stage-2 mappings to match the ownership attributes + * configured in the hypervisor stage-1. + */ + state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + switch (state) { + case PKVM_PAGE_OWNED: + return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + case PKVM_PAGE_SHARED_OWNED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + break; + case PKVM_PAGE_SHARED_BORROWED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + break; + default: + return -EINVAL; + } + + return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); +} + +static int finalize_host_mappings(void) +{ + struct kvm_pgtable_walker walker = { + .cb = finalize_host_mappings_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = finalize_host_mappings(); + if (ret) + goto out; + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f7af9688c1f7..a34b01cc8ab9 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); __activate_traps_common(vcpu); - val = CPTR_EL2_DEFAULT; + val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; if (!update_fp_enabled(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; @@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 mdcr_el2, cptr; + u64 cptr; ___deactivate_traps(vcpu); - mdcr_el2 = read_sysreg(mdcr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; @@ -92,13 +90,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } - __deactivate_traps_common(); - - mdcr_el2 &= MDCR_EL2_HPMN_MASK; - mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; - mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; + __deactivate_traps_common(vcpu); - write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); cptr = CPTR_EL2_DEFAULT; @@ -170,6 +163,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; + struct kvm_s2_mmu *mmu; bool pmu_switch_needed; u64 exit_code; @@ -213,7 +207,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_state_nvhe(guest_ctxt); - __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); + mmu = kern_hyp_va(vcpu->arch.hw_mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 38ed0f6f2703..d296d617f589 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -34,12 +34,12 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, } /* - * __load_guest_stage2() includes an ISB only when the AT + * __load_stage2() includes an ISB only when the AT * workaround is applied. Take care of the opposite condition, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_guest_stage2(mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 05321f4165e3..f8ceebe4982e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,16 +11,12 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> -#define KVM_PTE_VALID BIT(0) #define KVM_PTE_TYPE BIT(1) #define KVM_PTE_TYPE_BLOCK 0 #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) -#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) - #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) @@ -40,6 +36,8 @@ #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) @@ -48,9 +46,7 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 struct kvm_pgtable_walk_data { @@ -61,17 +57,6 @@ struct kvm_pgtable_walk_data { u64 end; }; -static u64 kvm_granule_shift(u32 level) -{ - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ - return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); -} - -static u64 kvm_granule_size(u32 level) -{ - return BIT(kvm_granule_shift(level)); -} - #define KVM_PHYS_INVALID (-1ULL) static bool kvm_phys_is_valid(u64 phys) @@ -79,15 +64,6 @@ static bool kvm_phys_is_valid(u64 phys) return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); } -static bool kvm_level_supports_block_mapping(u32 level) -{ - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. - */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); -} - static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) { u64 granule = kvm_granule_size(level); @@ -135,11 +111,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_valid(kvm_pte_t pte) -{ - return pte & KVM_PTE_VALID; -} - static bool kvm_pte_table(kvm_pte_t pte, u32 level) { if (level == KVM_PGTABLE_MAX_LEVELS - 1) @@ -151,16 +122,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static u64 kvm_pte_to_phys(kvm_pte_t pte) -{ - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; - - return pa; -} - static kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; @@ -326,6 +287,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, return _kvm_pgtable_walk(&walk_data); } +struct leaf_walk_data { + kvm_pte_t pte; + u32 level; +}; + +static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct leaf_walk_data *data = arg; + + data->pte = *ptep; + data->level = level; + + return 0; +} + +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level) +{ + struct leaf_walk_data data; + struct kvm_pgtable_walker walker = { + .cb = leaf_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), + PAGE_SIZE, &walker); + if (!ret) { + if (ptep) + *ptep = data.pte; + if (level) + *level = data.level; + } + + return ret; +} + struct hyp_map_data { u64 phys; kvm_pte_t attr; @@ -357,11 +357,47 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + u32 ap; + + if (!kvm_pte_valid(pte)) + return prot; + + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) + prot |= KVM_PGTABLE_PROT_X; + + ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); + if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) + prot |= KVM_PGTABLE_PROT_R; + else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) + prot |= KVM_PGTABLE_PROT_RW; + + return prot; +} + +static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +{ + /* + * Tolerate KVM recreating the exact same mapping, or changing software + * bits if the existing mapping was valid. + */ + if (old == new) + return false; + + if (!kvm_pte_valid(old)) + return true; + + return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW); +} + static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { @@ -371,9 +407,8 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; - /* Tolerate KVM recreating the exact same mapping */ new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (old != new && !WARN_ON(kvm_pte_valid(old))) + if (hyp_pte_needs_update(old, new)) smp_store_release(ptep, new); data->phys += granule; @@ -438,6 +473,8 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; pgt->mm_ops = mm_ops; pgt->mmu = NULL; + pgt->force_pte_cb = NULL; + return 0; } @@ -475,6 +512,9 @@ struct stage2_map_data { void *memcache; struct kvm_pgtable_mm_ops *mm_ops; + + /* Force mappings to page granularity */ + bool force_pte; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -539,11 +579,29 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + + if (!kvm_pte_valid(pte)) + return prot; + + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) + prot |= KVM_PGTABLE_PROT_R; + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) + prot |= KVM_PGTABLE_PROT_W; + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) + prot |= KVM_PGTABLE_PROT_X; + + return prot; +} + static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) @@ -588,6 +646,15 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } +static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, + struct stage2_map_data *data) +{ + if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1))) + return false; + + return kvm_block_mapping_supported(addr, end, data->phys, level); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -597,7 +664,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!kvm_block_mapping_supported(addr, end, phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return -E2BIG; if (kvm_phys_is_valid(phys)) @@ -641,7 +708,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, if (data->anchor) return 0; - if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return 0; data->childp = kvm_pte_follow(*ptep, data->mm_ops); @@ -771,6 +838,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, + .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -802,6 +870,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .memcache = mc, .mm_ops = pgt->mm_ops, .owner_id = owner_id, + .force_pte = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -995,6 +1064,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, u32 level; kvm_pte_t set = 0, clr = 0; + if (prot & KVM_PTE_LEAF_ATTR_HI_SW) + return -EINVAL; + if (prot & KVM_PGTABLE_PROT_R) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -1043,9 +1115,11 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags) + +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; u64 vtcr = arch->vtcr; @@ -1063,6 +1137,7 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch pgt->mm_ops = mm_ops; pgt->mmu = &arch->mmu; pgt->flags = flags; + pgt->force_pte_cb = force_pte_cb; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); @@ -1102,77 +1177,3 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); pgt->pgd = NULL; } - -#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \ - KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ - KVM_PTE_LEAF_ATTR_S2_IGNORED) - -static int stage2_check_permission_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) -{ - kvm_pte_t old_attr, pte = *ptep, *new_attr = arg; - - /* - * Compatible mappings are either invalid and owned by the page-table - * owner (whose id is 0), or valid with matching permission attributes. - */ - if (kvm_pte_valid(pte)) { - old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK; - if (old_attr != *new_attr) - return -EEXIST; - } else if (pte) { - return -EEXIST; - } - - return 0; -} - -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range) -{ - kvm_pte_t attr; - struct kvm_pgtable_walker check_perm_walker = { - .cb = stage2_check_permission_walker, - .flags = KVM_PGTABLE_WALK_LEAF, - .arg = &attr, - }; - u64 granule, start, end; - u32 level; - int ret; - - ret = stage2_set_prot_attr(pgt, prot, &attr); - if (ret) - return ret; - attr &= KVM_PTE_LEAF_S2_COMPAT_MASK; - - for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) { - granule = kvm_granule_size(level); - start = ALIGN_DOWN(addr, granule); - end = start + granule; - - if (!kvm_level_supports_block_mapping(level)) - continue; - - if (start < range->start || range->end < end) - continue; - - /* - * Check the presence of existing mappings with incompatible - * permissions within the current block range, and try one level - * deeper if one is found. - */ - ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker); - if (ret != -EEXIST) - break; - } - - if (!ret) { - range->start = start; - range->end = end; - } - - return ret; -} diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index f1e2e5a00933..289689b2682d 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b3229924d243..ded2c66675f0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -91,17 +91,9 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); } -void deactivate_traps_vhe_put(void) +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { - u64 mdcr_el2 = read_sysreg(mdcr_el2); - - mdcr_el2 &= MDCR_EL2_HPMN_MASK | - MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | - MDCR_EL2_TPMS; - - write_sysreg(mdcr_el2, mdcr_el2); - - __deactivate_traps_common(); + __deactivate_traps_common(vcpu); } /* Switch to the guest for VHE systems running in EL2 */ @@ -124,11 +116,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * * We have already configured the guest's stage 1 translation in * kvm_vcpu_load_sysregs_vhe above. We must now call - * __load_guest_stage2 before __activate_traps, because - * __load_guest_stage2 configures stage 2 translation, and + * __load_stage2 before __activate_traps, because + * __load_stage2 configures stage 2 translation, and * __activate_traps clear HCR_EL2.TGE (among other things). */ - __load_guest_stage2(vcpu->arch.hw_mmu); + __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 2a0b8c88d74f..007a12dd4351 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -101,7 +101,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *host_ctxt; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - deactivate_traps_vhe_put(); + deactivate_traps_vhe_put(vcpu); __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 66f17349f0c3..24cef9b87f9e 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -50,10 +50,10 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * * ARM erratum 1165522 requires some special handling (again), * as we need to make sure both stages of translation are in - * place before clearing TGE. __load_guest_stage2() already + * place before clearing TGE. __load_stage2() already * has an ISB in order to deal with this. */ - __load_guest_stage2(mmu); + __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..1a94a7ca48f2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -80,6 +80,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ void kvm_flush_remote_tlbs(struct kvm *kvm) { + ++kvm->stat.generic.remote_tlb_flush_requests; kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } @@ -259,10 +260,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size, { int err; - if (!kvm_host_owns_hyp_mappings()) { - return kvm_call_hyp_nvhe(__pkvm_create_mappings, - start, size, phys, prot); - } + if (WARN_ON(!kvm_host_owns_hyp_mappings())) + return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); @@ -282,6 +281,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) } } +static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end) +{ + phys_addr_t addr; + int ret; + + for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) { + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, + __phys_to_pfn(addr)); + if (ret) + return ret; + } + + return 0; +} + /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range @@ -302,6 +316,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) if (is_kernel_in_hyp_mode()) return 0; + if (!kvm_host_owns_hyp_mappings()) { + if (WARN_ON(prot != PAGE_HYP)) + return -EPERM; + return pkvm_share_hyp(kvm_kaddr_to_phys(from), + kvm_kaddr_to_phys(to)); + } + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -433,6 +454,32 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, return 0; } +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { + /* We shouldn't need any other callback to walk the PT */ + .phys_to_virt = kvm_host_va, +}; + +static int get_user_mapping_size(struct kvm *kvm, u64 addr) +{ + struct kvm_pgtable pgt = { + .pgd = (kvm_pte_t *)kvm->mm->pgd, + .ia_bits = VA_BITS, + .start_level = (KVM_PGTABLE_MAX_LEVELS - + CONFIG_PGTABLE_LEVELS), + .mm_ops = &kvm_user_mm_ops, + }; + kvm_pte_t pte = 0; /* Keep GCC quiet... */ + u32 level = ~0; + int ret; + + ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); + VM_BUG_ON(ret); + VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); + VM_BUG_ON(!(pte & PTE_VALID)); + + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); +} + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_host_zalloc_pages_exact, @@ -485,7 +532,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) mmu->arch = &kvm->arch; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); - mmu->vmid.vmid_gen = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); return 0; out_destroy_pgtable: @@ -780,7 +827,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * Returns the size of the mapping. */ static unsigned long -transparent_hugepage_adjust(struct kvm_memory_slot *memslot, +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) { @@ -791,8 +838,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (kvm_is_transparent_hugepage(pfn) && - fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && + get_user_mapping_size(kvm, hva) >= PMD_SIZE) { /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -814,7 +861,7 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, *ipap &= PMD_MASK; kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - kvm_get_pfn(pfn); + get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; @@ -1050,9 +1097,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) - vma_pagesize = transparent_hugepage_adjust(memslot, hva, - &pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { + if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) + vma_pagesize = fault_granule; + else + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, + hva, &pfn, + &fault_ipa); + } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new VM_SHARED VMA */ diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index 151c31fb9860..f9bb3b14130e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,7 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled()) + if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) static_branch_enable(&kvm_arm_pmu_available); return perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f33825c995cb..f5065f23b413 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); } return reg; @@ -564,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); int i; if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } else { - kvm_pmu_disable_counter_mask(vcpu, mask); + kvm_pmu_disable_counter_mask(vcpu, + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_counter_value(vcpu, i, 0); @@ -745,7 +745,7 @@ int kvm_pmu_probe_pmuver(void) struct perf_event_attr attr = { }; struct perf_event *event; struct arm_pmu *pmu; - int pmuver = 0xf; + int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF; /* * Create a dummy event that only counts user cycles. As we'll never @@ -770,7 +770,7 @@ int kvm_pmu_probe_pmuver(void) if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); - return 0xf; + return ID_AA64DFR0_PMUVER_IMP_DEF; } if (event->pmu) { @@ -923,7 +923,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!vcpu->kvm->arch.pmuver) vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); - if (vcpu->kvm->arch.pmuver == 0xf) + if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF) return -ENODEV; switch (attr->attr) { diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index db4056ecccfd..74c47d420253 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -59,6 +59,12 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct vcpu_reset_state *reset_state; @@ -66,9 +72,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) struct kvm_vcpu *vcpu = NULL; unsigned long cpu_id; - cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; - if (vcpu_mode_is_32bit(source_vcpu)) - cpu_id &= ~((u32) 0); + cpu_id = smccc_get_arg1(source_vcpu); + if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) + return PSCI_RET_INVALID_PARAMS; vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); @@ -126,6 +132,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) target_affinity = smccc_get_arg1(vcpu); lowest_affinity_level = smccc_get_arg2(vcpu); + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) + return PSCI_RET_INVALID_PARAMS; + /* Determine target affinity mask */ target_affinity_mask = psci_affinity_mask(lowest_affinity_level); if (!target_affinity_mask) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index cba7872d69a8..5ce36b0a3343 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -210,10 +210,16 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { + struct vcpu_reset_state reset_state; int ret; bool loaded; u32 pstate; + mutex_lock(&vcpu->kvm->lock); + reset_state = vcpu->arch.reset_state; + WRITE_ONCE(vcpu->arch.reset_state.reset, false); + mutex_unlock(&vcpu->kvm->lock); + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -276,8 +282,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. */ - if (vcpu->arch.reset_state.reset) { - unsigned long target_pc = vcpu->arch.reset_state.pc; + if (reset_state.reset) { + unsigned long target_pc = reset_state.pc; /* Gracefully handle Thumb2 entry point */ if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { @@ -286,13 +292,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } /* Propagate caller endianness */ - if (vcpu->arch.reset_state.be) + if (reset_state.be) kvm_vcpu_set_be(vcpu); *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0); - - vcpu->arch.reset_state.reset = false; + vcpu_set_reg(vcpu, 0, reset_state.r0); } /* Reset timer */ @@ -311,31 +315,26 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int parange, tgran_2; + unsigned int parange; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + /* + * IPA size beyond 48 bits could not be supported + * on either 4K or 16K page size. Hence let's cap + * it to 48 bits, in case it's reported as larger + * on the system. + */ + if (PAGE_SIZE != SZ_64K) + parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48); /* * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ - switch (PAGE_SIZE) { - default: - case SZ_4K: - tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT; - break; - case SZ_16K: - tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT; - break; - case SZ_64K: - tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT; - break; - } - - switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { + switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) { case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; @@ -369,7 +368,7 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (phys_shift) { if (phys_shift > kvm_ipa_limit || - phys_shift < 32) + phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f6f126eb6ac1..1d46e185f31e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -44,10 +44,6 @@ * 64bit interface. */ -#define reg_to_encoding(x) \ - sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ - (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) - static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) @@ -318,14 +314,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, /* * We want to avoid world-switching all the DBG registers all the * time: - * + * * - If we've touched any debug register, it is likely that we're * going to touch more of them. It then makes sense to disable the * traps and start doing the save/restore dance * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is * then mandatory to save/restore the registers, as the guest * depends on them. - * + * * For this, we use a DIRTY bit, indicating the guest has modified the * debug registers, used as follow: * @@ -603,6 +599,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); + + /* No PMU available, any PMU reg may UNDEF... */ + if (!kvm_arm_support_pmu_v3()) + return; + + n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; + n &= ARMV8_PMU_PMCR_N_MASK; + if (n) + mask |= GENMASK(n - 1, 0); + + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= mask; +} + +static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); +} + +static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; +} + +static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; +} + static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; @@ -845,7 +876,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, kvm_pmu_disable_counter_mask(vcpu, val); } } else { - p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } return true; @@ -869,7 +900,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMINTENCLR_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; } else { - p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } return true; @@ -891,7 +922,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMOVSCLR_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); } else { - p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } return true; @@ -944,16 +975,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } #define PMU_SYS_REG(r) \ - SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility + SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ + .reset = reset_pmevcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \ + .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1026,8 +1059,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) - /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) @@ -1038,40 +1069,40 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, switch (id) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) - val &= ~FEATURE(ID_AA64PFR0_SVE); - val &= ~FEATURE(ID_AA64PFR0_AMU); - val &= ~FEATURE(ID_AA64PFR0_CSV2); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~FEATURE(ID_AA64PFR0_CSV3); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); break; case SYS_ID_AA64PFR1_EL1: - val &= ~FEATURE(ID_AA64PFR1_MTE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); if (kvm_has_mte(vcpu->kvm)) { u64 pfr, mte; pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); } break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(FEATURE(ID_AA64ISAR1_APA) | - FEATURE(ID_AA64ISAR1_API) | - FEATURE(ID_AA64ISAR1_GPA) | - FEATURE(ID_AA64ISAR1_GPI)); + val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); break; case SYS_ID_AA64DFR0_EL1: /* Limit debug to ARMv8.0 */ - val &= ~FEATURE(ID_AA64DFR0_DEBUGVER); - val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6); /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, ID_AA64DFR0_PMUVER_SHIFT, kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); /* Hide SPE from guests */ - val &= ~FEATURE(ID_AA64DFR0_PMSVER); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER); break; case SYS_ID_DFR0_EL1: /* Limit guests to PMUv3 for ARMv8.4 */ @@ -1249,6 +1280,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, uaddr, true); } +static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + int err; + u64 val; + + /* Perform the access even if we are going to ignore the value */ + err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); + if (err) + return err; + + return 0; +} + static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1592,16 +1637,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(SYS_PMOVSCLR_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0 }, + /* + * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was + * previously (and pointlessly) advertised in the past... + */ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .access = access_pmswinc, .reg = PMSWINC_EL0 }, + .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), - .access = access_pmselr, .reg = PMSELR_EL0 }, + .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(SYS_PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), @@ -2106,23 +2156,6 @@ static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, return 0; } -static int match_sys_reg(const void *key, const void *elt) -{ - const unsigned long pval = (unsigned long)key; - const struct sys_reg_desc *r = elt; - - return pval - reg_to_encoding(r); -} - -static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) -{ - unsigned long pval = reg_to_encoding(params); - - return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); -} - int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); @@ -2365,13 +2398,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) trace_kvm_handle_sys_reg(esr); - params.Op0 = (esr >> 20) & 3; - params.Op1 = (esr >> 14) & 0x7; - params.CRn = (esr >> 10) & 0xf; - params.CRm = (esr >> 1) & 0xf; - params.Op2 = (esr >> 17) & 0x7; + params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - params.is_write = !(esr & 1); ret = emulate_sys_reg(vcpu, ¶ms); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9d0621417c2a..cc0cc95a0280 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -11,6 +11,12 @@ #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ #define __ARM64_KVM_SYS_REGS_LOCAL_H__ +#include <linux/bsearch.h> + +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) + struct sys_reg_params { u8 Op0; u8 Op1; @@ -21,6 +27,14 @@ struct sys_reg_params { bool is_write; }; +#define esr_sys64_to_params(esr) \ + ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ + .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; @@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1, return i1->Op2 - i2->Op2; } +static inline int match_sys_reg(const void *key, const void *elt) +{ + const unsigned long pval = (unsigned long)key; + const struct sys_reg_desc *r = elt; + + return pval - reg_to_encoding(r); +} + +static inline const struct sys_reg_desc * +find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], + unsigned int num) +{ + unsigned long pval = reg_to_encoding(params); + + return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); +} + const struct sys_reg_desc *find_reg_by_id(u64 id, struct sys_reg_params *params, const struct sys_reg_desc table[], diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 8d78acc4fba7..064a58c19f48 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -78,13 +78,17 @@ TRACE_EVENT(kvm_arm_clear_debug, TP_printk("flags: 0x%08x", __entry->guest_debug) ); +/* + * The dreg32 name is a leftover from a distant past. This will really + * output a 64bit value... + */ TRACE_EVENT(kvm_arm_set_dreg32, - TP_PROTO(const char *name, __u32 value), + TP_PROTO(const char *name, __u64 value), TP_ARGS(name, value), TP_STRUCT__entry( __field(const char *, name) - __field(__u32, value) + __field(__u64, value) ), TP_fast_assign( @@ -92,7 +96,7 @@ TRACE_EVENT(kvm_arm_set_dreg32, __entry->value = value; ), - TP_printk("%s: 0x%08x", __entry->name, __entry->value) + TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); TRACE_DEFINE_SIZEOF(__u64); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index a016f07adc28..5f9014ae595b 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. @@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 2c580204f1dc..95a18cec14a3 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) u32 val = cpuif->vgic_lr[lr]; u32 cpuid, intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; + bool deactivated; /* Extract the source vCPU id from the LR */ cpuid = val & GICH_LR_PHYSID_CPUID; @@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); if (irq->active && vgic_irq_is_sgi(intid)) @@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & GICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 66004f61cd83..21a6207fb2ee 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; + bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; @@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) @@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & ICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 111bff47e471..5dad4996cfb2 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid >= VGIC_MIN_LPI) return vgic_get_lpi(kvm, intid); - WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; } @@ -1022,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) return map_is_active; } + +/* + * Level-triggered mapped IRQs are special because we only observe rising + * edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample the physical + * line and set the line level, because the device state could have changed + * or we simply need to process the still pending interrupt later. + * + * We could also have entered the guest with the interrupt active+pending. + * On the next exit, we need to re-evaluate the pending state, as it could + * otherwise result in a spurious interrupt by injecting a now potentially + * stale pending state. + * + * If this causes us to lower the level, we have to also clear the physical + * active state, since we will otherwise never be told when the interrupt + * becomes asserted again. + * + * Another case is when the interrupt requires a helping hand on + * deactivation (no HW deactivation, for example). + */ +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending) +{ + if (vgic_irq_is_mapped_level(irq)) { + bool resample = false; + + if (unlikely(vgic_irq_needs_resampling(irq))) { + resample = !(irq->active || irq->pending_latch); + } else if (lr_pending || (lr_deactivated && irq->line_level)) { + irq->line_level = vgic_get_phys_line_level(irq); + resample = !irq->line_level; + } + + if (resample) + vgic_irq_set_phys_active(irq, false); + } +} diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index dc1f3d1657ee..14a9218641f5 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags); void vgic_kick_vcpus(struct kvm *kvm); +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index af9dd029a4e1..75c6f264c626 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits), #endif }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 43cad10b877d..4adca5abbc72 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -388,7 +388,6 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, u32 compare, u32 cause) { u32 start_count, after_count; - ktime_t freeze_time; unsigned long flags; /* @@ -396,7 +395,7 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, * this with interrupts disabled to avoid latency. */ local_irq_save(flags); - freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count); + kvm_mips_freeze_hrtimer(vcpu, &start_count); write_c0_gtoffset(start_count - read_c0_count()); local_irq_restore(flags); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a779f7849cfb..080a7feb7731 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -103,7 +103,6 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; - u64 halt_wait_ns; u64 halt_successful_wait; u64 dbell_exits; u64 gdbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 79833f78d1da..b785f6772391 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -71,7 +69,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), @@ -88,8 +85,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..6365087f3160 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 636c6ae0939b..870b7f0c7ea5 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bb0dacf7cbec..2acb1c96cfaf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4202,19 +4202,31 @@ out: /* Attribute wait time */ if (do_sleep) { - vc->runner->stat.halt_wait_ns += + vc->runner->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(start_wait); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(start_wait)); /* Attribute failed poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_fail_ns += ktime_to_ns(start_wait) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_fail_hist, + ktime_to_ns(start_wait) - + ktime_to_ns(start_poll)); + } } else { /* Attribute successful poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_success_ns += ktime_to_ns(cur) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_success_hist, + ktime_to_ns(cur) - ktime_to_ns(start_poll)); + } } /* Adjust poll time */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 551b30d84aee..977801c83aff 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -69,7 +67,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), @@ -79,8 +76,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d681ae462350..a604d51acfc8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_s390_sie_block { __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 +#define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ @@ -955,6 +956,7 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d548d60caed2..16256e17a544 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); if (deliverable_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index efda0615741f..752a0ffab9bf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), STATS_DESC_COUNTER(VCPU, pfault_sync) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -1953,7 +1949,7 @@ out: static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (gfn >= memslots[slot].base_gfn && @@ -1974,7 +1970,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) if (gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); } return start; @@ -3224,6 +3220,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@ -4068,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9fad25109b0d..ecd741ee3276 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4002a24bc43a..acda4b6fc851 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -510,6 +510,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index a12a4987154e..cefe1d81e2e8 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) KVM_X86_OP(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) -KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af6ce8d4c86a..f8f48a7ec577 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,9 +37,21 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS -#define KVM_MAX_VCPUS 288 -#define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 +#define KVM_MAX_VCPUS 1024 +#define KVM_SOFT_MAX_VCPUS 710 + +/* + * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs + * might be larger than the actual number of VCPUs because the + * APIC ID encodes CPU topology information. + * + * In the worst case, we'll need less than one extra bit for the + * Core ID, and less than one extra bit for the Package (Die) ID, + * so ratio of 4 should be enough. + */ +#define KVM_VCPU_ID_RATIO 4 +#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) + /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -124,13 +136,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 @@ -229,7 +234,8 @@ enum x86_intercept_stage; KVM_GUESTDBG_USE_HW_BP | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_INJECT_BP | \ - KVM_GUESTDBG_INJECT_DB) + KVM_GUESTDBG_INJECT_DB | \ + KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_BIT 0 @@ -447,6 +453,7 @@ struct kvm_mmu { u64 *pae_root; u64 *pml4_root; + u64 *pml5_root; /* * check zero bits on shadow page table entries, these @@ -482,6 +489,7 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; + bool is_paused; }; struct kvm_pmu { @@ -522,7 +530,6 @@ struct kvm_pmu_ops; enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, - KVM_DEBUGREG_RELOAD = 4, }; struct kvm_mtrr_range { @@ -723,7 +730,6 @@ struct kvm_vcpu_arch { u64 reserved_gpa_bits; int maxphyaddr; - int max_tdp_level; /* emulate context */ @@ -988,6 +994,12 @@ struct kvm_hv { /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; + /* + * How many SynICs use 'AutoEOI' feature + * (protected by arch.apicv_update_lock) + */ + unsigned int synic_auto_eoi_used; + struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; }; @@ -1002,9 +1014,8 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { bool long_mode; - bool shinfo_set; u8 upcall_vector; - struct gfn_to_hva_cache shinfo_cache; + gfn_t shinfo_gfn; }; enum kvm_irqchip_mode { @@ -1061,6 +1072,9 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; + /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ + struct mutex apicv_update_lock; + bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; @@ -1213,9 +1227,17 @@ struct kvm_vm_stat { u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; - u64 lpages; + union { + struct { + atomic64_t pages_4k; + atomic64_t pages_2m; + atomic64_t pages_1g; + }; + atomic64_t pages[KVM_NR_PAGE_SIZES]; + }; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; + u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { @@ -1359,7 +1381,6 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(ulong bit); - void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1543,12 +1564,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); @@ -1744,6 +1765,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, unsigned long bit); +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, + unsigned long bit); + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, @@ -1754,8 +1778,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level); +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { @@ -1779,11 +1803,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline u32 get_rdx_init_val(void) -{ - return 0x600; /* P6 family */ -} - static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); @@ -1816,31 +1835,6 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void kvm_spurious_fault(void); - -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Usually after catching the fault we just panic; during reboot - * instead the instruction is ignored. - */ -#define __kvm_handle_fault_on_reboot(insn) \ - "666: \n\t" \ - insn "\n\t" \ - "jmp 668f \n\t" \ - "667: \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_begin \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "call kvm_spurious_fault \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_end \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "668: \n\t" \ - _ASM_EXTABLE(666b, 667b) - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a6c327f8ad9e..2ef1f6513c68 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -295,6 +295,7 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW_BP 0x00020000 #define KVM_GUESTDBG_INJECT_DB 0x00040000 #define KVM_GUESTDBG_INJECT_BP 0x00080000 +#define KVM_GUESTDBG_BLOCKIRQ 0x00100000 /* for KVM_SET_GUEST_DEBUG */ struct kvm_guest_debug_arch { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a26643dc6bd6..b656456c3a94 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val) } else { local_irq_disable(); + /* safe_halt() will enable IRQ */ if (READ_ONCE(*ptr) == val) safe_halt(); - - local_irq_enable(); + else + local_irq_enable(); } } diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95a98413dc32..54a83a744538 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -7,6 +7,8 @@ #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" +#include "mmu.h" +#include "mmu/mmu_internal.h" static int vcpu_get_timer_advance_ns(void *data, u64 *val) { @@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ &vcpu_tsc_scaling_frac_fops); } } + +/* + * This covers statistics <1024 (11=log(1024)+1), which should be enough to + * cover RMAP_RECYCLE_THRESHOLD. + */ +#define RMAP_LOG_SIZE 11 + +static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" }; + +static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) +{ + struct kvm_rmap_head *rmap; + struct kvm *kvm = m->private; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned int lpage_size, index; + /* Still small enough to be on the stack */ + unsigned int *log[KVM_NR_PAGE_SIZES], *cur; + int i, j, k, l, ret; + + ret = -ENOMEM; + memset(log, 0, sizeof(log)); + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL); + if (!log[i]) + goto out; + } + + mutex_lock(&kvm->slots_lock); + write_lock(&kvm->mmu_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + for (j = 0; j < slots->used_slots; j++) { + slot = &slots->memslots[j]; + for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { + rmap = slot->arch.rmap[k]; + lpage_size = kvm_mmu_slot_lpages(slot, k + 1); + cur = log[k]; + for (l = 0; l < lpage_size; l++) { + index = ffs(pte_list_count(&rmap[l])); + if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE)) + index = RMAP_LOG_SIZE - 1; + cur[index]++; + } + } + } + } + + write_unlock(&kvm->mmu_lock); + mutex_unlock(&kvm->slots_lock); + + /* index=0 counts no rmap; index=1 counts 1 rmap */ + seq_printf(m, "Rmap_Count:\t0\t1\t"); + for (i = 2; i < RMAP_LOG_SIZE; i++) { + j = 1 << (i - 1); + k = (1 << i) - 1; + seq_printf(m, "%d-%d\t", j, k); + } + seq_printf(m, "\n"); + + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]); + cur = log[i]; + for (j = 0; j < RMAP_LOG_SIZE; j++) + seq_printf(m, "%d\t", cur[j]); + seq_printf(m, "\n"); + } + + ret = 0; +out: + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) + kfree(log[i]); + + return ret; +} + +static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + return single_open(file, kvm_mmu_rmaps_stat_show, kvm); +} + +static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + kvm_put_kvm(kvm); + + return single_release(inode, file); +} + +static const struct file_operations mmu_rmaps_stat_fops = { + .open = kvm_mmu_rmaps_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_mmu_rmaps_stat_release, +}; + +int kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, + &mmu_rmaps_stat_fops); + return 0; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 41d2a53c5dea..232a86a6faaf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + int auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + + if (!!auto_eoi_old == !!auto_eoi_new) + return; + + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + __kvm_request_apicv_update(vcpu->kvm, + !hv->synic_auto_eoi_used, + APICV_INHIBIT_REASON_HYPERV); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so request - * to deactivate APICV permanently. - */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; @@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a6e218c6140d..5a69cce4d72d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (!kvm_vcpu_is_bsp(vcpu) || !pit) + /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ + if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 11e4065e1617..bbd4a5d18b5d 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -35,11 +35,7 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba5a27879f1d..76fb00921203 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; + WARN_ONCE(!irqchip_in_kernel(kvm), + "Dirty APIC map without an in-kernel local APIC"); + mutex_lock(&kvm->arch.apic_map_lock); /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map @@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) @@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) struct kvm_lapic *apic = vcpu->arch.apic; int i; + if (!init_event) { + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + if (!apic) return; @@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE); + apic->base_address = APIC_DEFAULT_PHYS_BASE; + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); @@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - kvm_lapic_set_base(vcpu, - vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { @@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } - /* - * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC state has changed. - */ - vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 83e6c6965f1e..e9688a9f7b57 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + +static inline unsigned long +__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, + int level) +{ + return gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; +} + +static inline unsigned long +kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level) +{ + return __kvm_mmu_slot_lpages(slot, slot->npages, level); +} + +static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) +{ + atomic64_add(count, &kvm->stat.pages[level - 1]); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 47b765270239..2d7e61122af8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; enum { @@ -137,12 +138,22 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -/* make pte_list_desc fit well in cache line */ -#define PTE_LIST_EXT 3 +/* make pte_list_desc fit well in cache lines */ +#define PTE_LIST_EXT 14 +/* + * Slight optimization of cacheline layout, by putting `more' and `spte_count' + * at the start; then accessing it will only use one single cacheline for + * either full (entries==PTE_LIST_EXT) case or entries<=6. + */ struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; + /* + * Stores number of entries stored in the pte_list_desc. No need to be + * u64 but just for easier alignment. When PTE_LIST_EXT, means full. + */ + u64 spte_count; + u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { @@ -193,7 +204,7 @@ struct kvm_mmu_role_regs { * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ -static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ { \ return !!(regs->reg & flag); \ } @@ -215,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ -static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ } @@ -323,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { - /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { - exception->error_code |= PFERR_RSVD_MASK; - return UNMAPPED_GVA; - } - return gpa; } @@ -592,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. - * Returns non-zero if the PTE was previously valid. + * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(u64 *sptep) +static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; + int level = sptep_to_sp(sptep)->role.level; if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); @@ -605,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep) old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_shadow_present_pte(old_spte)) - return 0; + return old_spte; + + kvm_update_page_stats(kvm, level, -1); pfn = spte_to_pfn(old_spte); @@ -622,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); - return 1; + return old_spte; } /* @@ -686,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -786,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, +static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; @@ -799,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, } } -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } @@ -893,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - int i, count = 0; + int count = 0; if (!rmap_head->val) { rmap_printk("%p %llx 0->1\n", spte, *spte); @@ -903,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; + desc->spte_count = 2; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1]) { + while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; - if (!desc->more) { desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; + desc->spte_count = 0; break; } desc = desc->more; } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; + count += desc->spte_count; + desc->sptes[desc->spte_count++] = spte; } return count; } @@ -930,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) { - int j; + int j = desc->spte_count - 1; - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; - if (j != 0) + desc->spte_count--; + if (desc->spte_count) return; if (!prev_desc && !desc->more) rmap_head->val = 0; @@ -969,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(rmap_head, desc, i, prev_desc); @@ -984,30 +999,68 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + u64 *sptep) { - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); __pte_list_remove(sptep, rmap_head); } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +/* Return true if rmap existed, false otherwise */ +static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { - unsigned long idx; + struct pte_list_desc *desc, *next; + int i; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + for (; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(kvm, desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; } -static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, - struct kvm_mmu_page *sp) +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; + struct pte_list_desc *desc; + unsigned int count = 0; - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); + if (!rmap_head->val) + return 0; + else if (!(rmap_head->val & 1)) + return 1; + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + count += desc->spte_count; + desc = desc->more; + } + + return count; +} + +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, + const struct kvm_memory_slot *slot) +{ + unsigned long idx; + + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1020,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } + static void rmap_remove(struct kvm *kvm, u64 *spte) { + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmap_head = gfn_to_rmap(kvm, gfn, sp); + + /* + * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the + * context of a vCPU so have to determine which memslots to use based + * on context information in sp->role. + */ + slots = kvm_memslots_for_spte_role(kvm, sp->role); + + slot = __gfn_to_memslot(slots, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + __pte_list_remove(spte, rmap_head); } @@ -1119,7 +1187,9 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (mmu_spte_clear_track_bits(sptep)) + u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); + + if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } @@ -1129,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) if (is_large_pte(*sptep)) { WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); - --kvm->stat.lpages; return true; } @@ -1218,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1256,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1289,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ @@ -1356,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); + rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } } @@ -1377,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(kvm, rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1421,13 +1479,13 @@ restart: need_flush = 1; if (pte_write(pte)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( *sptep, new_pfn); - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); mmu_spte_set(sptep, new_spte); } } @@ -1442,7 +1500,7 @@ restart: struct slot_rmap_walk_iterator { /* input fields. */ - struct kvm_memory_slot *slot; + const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; @@ -1462,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, - iterator->slot); + iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - struct kvm_memory_slot *slot, int start_level, + const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; @@ -1584,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, @@ -2232,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); @@ -2716,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); - if (!was_rmapped && is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; - if (is_shadow_present_pte(*sptep)) { - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); - } + if (!was_rmapped) { + kvm_update_page_stats(vcpu->kvm, level, 1); + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); } return ret; @@ -2852,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; + int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -2863,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - return host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + return min(host_level, max_level); } int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2887,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K) - return level; - - *req_level = level = min(level, max_level); - /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - if (huge_page_disallowed) + *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); + if (level == PG_LEVEL_4K || huge_page_disallowed) return PG_LEVEL_4K; /* @@ -2965,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; drop_large_spte(vcpu, it.sptep); - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - - link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + if (is_shadow_present_pte(*it.sptep)) + continue; + + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); + if (is_tdp && huge_page_disallowed && + req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, @@ -3122,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + +/* + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + */ +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +{ struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3141,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3206,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3220,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3455,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_mask; - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (WARN_ON_ONCE(!mmu->pml5_root)) { + r = -EIO; + goto out_unlock; + } + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + } } for (i = 0; i < 4; ++i) { @@ -3482,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3498,7 +3582,10 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *pml4_root, *pae_root; + bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3511,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) return 0; /* - * This mess only works with 4-level paging and needs to be updated to - * work with 5-level paging. + * NPT, the only paging mode that uses this horror, uses a fixed number + * of levels for the shadow page tables, e.g. all MMUs are 4-level or + * all MMus are 5-level. Thus, this can safely require that pml5_root + * is allocated if the other roots are valid and pml5 is needed, as any + * prior MMU would also have required pml5. */ - if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) - return -EIO; - - if (mmu->pae_root && mmu->pml4_root) + if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + (need_pml5 && mmu->pml5_root))) return -EIO; /* @@ -3535,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; +#ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) { - free_page((unsigned long)pae_root); - return -ENOMEM; + if (!pml4_root) + goto err_pml4; + + if (need_pml5) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; } +#endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; return 0; + +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3640,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3647,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3662,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3675,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -3795,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable) + bool write, bool *writable, int *r) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3808,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return true; - - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; - return false; + goto out_retry; + + if (!kvm_is_visible_memslot(slot)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu)) { + *pfn = KVM_PFN_NOSLOT; + *writable = false; + return false; + } + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && + !kvm_apicv_activated(vcpu->kvm)) { + *r = RET_PF_EMULATE; + return true; + } } async = false; @@ -3828,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; + goto out_retry; } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; + goto out_retry; } *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable, hva); - return false; + +out_retry: + *r = RET_PF_RETRY; + return true; } static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -3854,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -3867,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; @@ -4588,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + /* tdp_root_level is architecture forced level, use it if nonzero */ + if (tdp_root_level) + return tdp_root_level; + /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; @@ -5160,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - if (WARN_ON_ONCE(r == RET_PF_INVALID)) + if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5279,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) */ } -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; + tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; /* @@ -5302,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot); +typedef bool (*slot_level_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) @@ -5334,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool flush_on_yield) { @@ -5345,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, @@ -5358,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); + free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) @@ -5587,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +/* + * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end + * (not including it) + */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; @@ -5594,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; + write_lock(&kvm->mmu_lock); + + kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + if (kvm_memslots_have_rmaps(kvm)) { - write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5606,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush = slot_handle_level_range(kvm, memslot, + flush = slot_handle_level_range(kvm, + (const struct kvm_memory_slot *) memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - write_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - - read_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush, true); + gfn_end, flush); if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end); - - read_unlock(&kvm->mmu_lock); + gfn_end - gfn_start); } + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + + kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level) { bool flush = false; @@ -5676,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5699,7 +5835,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -5715,10 +5851,8 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *slot) { - /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { @@ -5754,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { bool flush = false; diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index cedc17b2f60e..9e7dcf999f08 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 35567293c1fd..bf2bdbf333c2 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -31,13 +31,16 @@ extern bool dbg; #define IS_VALID_PAE_ROOT(x) (!!(x)) struct kvm_mmu_page { + /* + * Note, "link" through "spt" fit in a single 64 byte cache line on + * 64-bit kernels, keep it that way unless there's a reason not to. + */ struct list_head link; struct hlist_node hash_link; - struct list_head lpage_disallowed_link; + bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* @@ -59,6 +62,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); + struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -71,8 +75,6 @@ struct kvm_mmu_page { atomic_t write_flooding_count; #ifdef CONFIG_X86_64 - bool tdp_mmu_page; - /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif @@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). @@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index efbad33a0645..2924a4081a19 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 91a9f7e0fd91..269f11f92fd0 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,6 +16,7 @@ #include <asm/kvm_page_track.h> +#include "mmu.h" #include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ee044d357b5f..7d03e9b7ccfa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d80cb122b5f3..64ccfc1fa553 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,7 +10,7 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = false; +static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ @@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the new page - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. * @account_nx: This page replaces a NX large page and should be marked for * eventual reclaim. */ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared, bool account_nx) + bool account_nx) { - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); - + spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) account_huge_nx_page(kvm, sp); - - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -445,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); - if (is_large_pte(old_spte) != is_large_pte(new_spte)) { - if (is_large_pte(old_spte)) - atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); - else - atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); - } - /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -477,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, return; } + if (is_leaf != was_leaf) + kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) @@ -526,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -537,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +/* + * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a + * TDP page fault. + * + * @vcpu: The vcpu instance that took the TDP page fault. + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, + struct tdp_iter *iter, + u64 new_spte) { + struct kvm *kvm = vcpu->kvm; + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) return false; - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, iter->level); + /* + * Use kvm_vcpu_gfn_to_memslot() instead of going through + * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. + */ + if (is_writable_pte(new_spte)) { + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); + + if (slot && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON_ONCE(iter->level > PG_LEVEL_4K); + mark_page_dirty_in_slot(kvm, slot, iter->gfn); + } + } + return true; } @@ -558,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -789,21 +804,15 @@ retry: * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. - * - * If shared is true, this thread holds the MMU lock in read mode and must - * account for the possibility that other threads are modifying the paging - * structures concurrently. If shared is false, this thread should hold the - * MMU in write mode. */ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared) + gfn_t end, bool can_yield, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, - shared); + false); return flush; } @@ -814,8 +823,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, - flush, false); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); if (flush) kvm_flush_remote_tlbs(kvm); @@ -940,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) return RET_PF_RETRY; /* @@ -1044,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, - new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, true, + if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, huge_page_disallowed && req_level >= iter.level); @@ -1255,8 +1262,8 @@ retry: * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level) +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1326,7 +1333,8 @@ retry: * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1529,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1540,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1cae4485b3bc..358f447d4012 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared); + gfn_t end, bool can_yield, bool flush); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush, - bool shared) + gfn_t start, gfn_t end, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, - shared); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) */ lockdep_assert_held_write(&kvm->mmu_lock); return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false, false); + sp->gfn, end, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); @@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level); +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level); bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, - struct kvm_memory_slot *slot); + const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, @@ -77,8 +74,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 827886c12c16..0772bad9165c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; } static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return; /* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false; clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 67e753edfa22..0e4f2b1fa9fb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running; counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a8ad78a2faa1..8052d92069e0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; else @@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_update_access_page(struct kvm *kvm, bool activate) +static int avic_alloc_access_page(struct kvm *kvm) { void __user *ret; int r = 0; mutex_lock(&kvm->slots_lock); - /* - * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger - * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT - * memory region. So, we need to ensure that kvm->mm == current->mm. - */ - if ((kvm->arch.apic_access_memslot_enabled == activate) || - (kvm->mm != current->mm)) + + if (kvm->arch.apic_access_memslot_enabled) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - activate ? PAGE_SIZE : 0); + PAGE_SIZE); if (IS_ERR(ret)) { r = PTR_ERR(ret); goto out; } - kvm->arch.apic_access_memslot_enabled = activate; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_update_access_page(vcpu->kvm, true); + ret = avic_alloc_access_page(vcpu->kvm); if (ret) return ret; } @@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) -{ - if (!enable_apicv || !lapic_in_kernel(vcpu)) - return; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_request_apicv_update(vcpu->kvm, activate, - APICV_INHIBIT_REASON_IRQWIN); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -} - void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; @@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } vmcb_mark_dirty(vmcb, VMCB_AVIC); + if (activated) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); + svm_set_pi_irte_mode(vcpu, activated); } @@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) -{ - avic_update_access_page(kvm, activate); -} static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) @@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); @@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) struct vcpu_svm *svm = to_svm(vcpu); svm->avic_is_running = is_run; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + if (is_run) avic_vcpu_load(vcpu, vcpu->cpu); else diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e5515477c30a..2545d0c61985 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -666,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7fbce342eec4..75e0b21ad07c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -28,8 +28,6 @@ #include "cpuid.h" #include "trace.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - #ifndef CONFIG_KVM_AMD_SEV /* * When this config is not defined, SEV feature is not supported and APIs in @@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xcr0 = svm->vcpu.arch.xcr0; save->pkru = svm->vcpu.arch.pkru; save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; /* * SEV-ES will use a VMSA that is pointed to by the VMCB, not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 69639f9624f5..05e8d4d27969 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -46,8 +46,6 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -261,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr) static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 - return PT64_ROOT_4LEVEL; + return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@ -462,11 +460,6 @@ static int has_svm(void) return 0; } - if (pgtable_l5_enabled()) { - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); - return 0; - } - return 1; } @@ -1015,7 +1008,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ @@ -1161,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@ -1241,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@ -1273,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; @@ -1330,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu) static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); - - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); - kvm_rdx_write(vcpu, eax); - - if (kvm_vcpu_apicv_active(vcpu) && !init_event) - avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1513,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); } - avic_vcpu_load(vcpu, cpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; @@ -1560,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } @@ -2078,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. */ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2993,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->msr_decfg = data; break; } - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); - fallthrough; default: return kvm_set_msr_common(vcpu, msr); } @@ -3021,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(vcpu, true); + kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3269,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]) - return 0; + return (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]); +} +static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +{ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3282,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - - return -EINVAL; + return 0; } int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(vcpu, exit_code)) - return 0; + if (!svm_check_exit_valid(vcpu, exit_code)) + return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) @@ -3573,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - svm_toggle_avic_for_irq_window(vcpu, false); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@ -3808,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -4610,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bd0fe94c2920..524d943f3efc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) -{ - svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); -} - static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -524,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); @@ -534,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu); void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 8170f2a5a16f..22e2b019de37 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -4,7 +4,7 @@ #include <linux/compiler_types.h> -#include <asm/kvm_host.h> +#include "x86.h" #define svm_asm(insn, clobber...) \ do { \ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 896b2a50b4aa..0dab1b7b529f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #if IS_ENABLED(CONFIG_HYPERV) -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 2ec9b46f0d0c..152ab0aa82cf 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -73,8 +73,6 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -#undef ROL16 - static inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b3f77d18eb5a..ccb03d69546c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2207,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, } } -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); @@ -2218,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * PIN CONTROLS */ - exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; @@ -2271,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; + exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | @@ -2282,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING); + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_DESC); + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2322,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= vmcs12->vm_entry_controls; + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; @@ -2339,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - exec_control = vmx_vmexit_ctrl(); + exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* @@ -3384,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - prepare_vmcs02_early(vmx, vmcs12); + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { @@ -4304,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@ -4315,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@ -4333,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@ -4419,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9efc1a6b8693..10cc4f65c4ef 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 4b9957e2bf5b..6e5de2e2b0da 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -11,6 +11,8 @@ #include "capabilities.h" +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + struct vmcs_hdr { u32 revision_id:31; u32 shadow_vmcs:1; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index d9f5d7c56ae3..cab6ba7a5005 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -2,7 +2,6 @@ #include "vmcs12.h" -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 5e0e1b39f495..2a45f026ee11 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void) extern const unsigned short vmcs_field_to_offset_table[]; extern const unsigned int nr_vmcs12_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static inline short vmcs_field_to_offset(unsigned long field) { unsigned short offset; @@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field) return offset; } -#undef ROL16 - static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 927a552393b9..0c2c0d5ae873 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, } /* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. */ -static void setup_msrs(struct vcpu_vmx *vmx) +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; @@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. @@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (is_unrestricted_guest(vcpu) || - (enable_ept && is_paging(vcpu))) + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. + */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); break; } } @@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } - vmx_set_segment(vcpu, save, seg); + __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); return 0; } @@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + unsigned long hw_cr0, old_cr0_pg; + u32 tmp; + + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); } + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (enable_ept && !is_unrestricted_guest(vcpu)) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ + if (!(cr0 & X86_CR0_PG)) { + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); @@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; + return; } vmcs_writel(sf->base, var->base); @@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} -out: - vmx->emulation_required = emulation_required(vcpu); +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (secondary_exec_controls_get(to_vmx(vcpu)) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; @@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + if (!cpu_has_vmx_msr_bitmap()) return; + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + vmx_reset_x2apic_msrs(vcpu, mode); /* @@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, mode); - - vmx->msr_bitmap_mode = mode; -} - void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } pt_update_intercept_for_msr(vcpu); - vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; @@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } +static u32 vmx_vmentry_ctrl(void) +{ + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + + if (vmx_pt_mode_is_system()) + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static u32 vmx_vmexit_ctrl(void) +{ + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + + if (vmx_pt_mode_is_system()) + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } -u32 vmx_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; @@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; - vmx->secondary_exec_control = exec_control; + return exec_control; } #define VMX_XSS_EXIT_BITMAP 0 @@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - secondary_exec_controls_set(vmx, vmx->secondary_exec_control); - } + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - vmx_update_exception_bitmap(vcpu); - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * exc_debug expects dr6 to be cleared after it runs, avoid that it sees + * a stale dr6 from the guest. + */ + set_debugreg(DR6_RESERVED, 6); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - WARN_ON_ONCE(!enable_vnmi); + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * below) should never happen as that means we incorrectly allowed a * nested VM-Enter with an invalid vmcs12. */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) } secondary_exec_controls_set(vmx, sec_exec_control); - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) @@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) int max_irr; bool max_irr_updated; - WARN_ON(!vcpu->arch.apicv_active); + if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + return -EIO; + if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* @@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; - if (WARN_ONCE(!is_external_intr(intr_info), + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->emulation_required) + return; + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) @@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + set_debugreg(vcpu->arch.dr6, 6); + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } - vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); @@ -6997,7 +7017,7 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 new_ctl = vmx->secondary_exec_control; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); @@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx); - } + vmx_setup_uret_msrs(vmx); + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(vmx, + vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -7803,7 +7823,8 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 17a1cb4b059d..4858c5fd95f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -227,7 +227,7 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; - u8 msr_bitmap_mode; + u8 x2apic_msr_bitmap_mode; /* * If true, host state has been stored in vmx->loaded_vmcs for @@ -263,8 +263,6 @@ struct vcpu_vmx { u64 spec_ctrl; u32 msr_ia32_umwait_control; - u32 secondary_exec_control; - /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -371,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); @@ -419,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ +static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->loaded_vmcs->controls_shadow.lname; \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ @@ -451,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; } -static inline u32 vmx_vmentry_ctrl(void) -{ - u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (vmx_pt_mode_is_system()) - vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); -} - -static inline u32 vmx_vmexit_ctrl(void) -{ - u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (vmx_pt_mode_is_system()) - vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -} - -u32 vmx_exec_control(struct vcpu_vmx *vmx); -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); - static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 164b64f65a8f..9e9ef47e988c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,13 +4,11 @@ #include <linux/nospec.h> -#include <asm/kvm_host.h> #include <asm/vmx.h> #include "evmcs.h" #include "vmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5d5c5ed7dd4..28ef14155726 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), - STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible noinstr void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); @@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } @@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } @@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -6567,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -8578,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { + mutex_init(&kvm->arch.apicv_update_lock); + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); @@ -8891,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) can_inject = false; } + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) + return 0; + /* * Finally, inject interrupt events. If an event cannot be injected * due to architectural conditions (e.g. IF=0) a window-open exit @@ -9236,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + bool activate; + if (!lapic_in_kernel(vcpu)) return; - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + activate = kvm_apicv_activated(vcpu->kvm); + if (vcpu->arch.apicv_active == activate) + goto out; + + vcpu->arch.apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9251,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) */ if (!vcpu->arch.apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + +out: + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - struct kvm_vcpu *except; - unsigned long old, new, expected; + unsigned long old, new; if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; - old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); - do { - expected = new = old; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); - if (new == old) - break; - old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); - } while (old != expected); - - if (!!old == !!new) - return; + old = new = kvm->arch.apicv_inhibit_reasons; - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + + if (!!old != !!new) { + trace_kvm_apicv_update_request(activate, bit); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + kvm_zap_gfn_range(kvm, gfn, gfn+1); + } + } else + kvm->arch.apicv_inhibit_reasons = new; +} +EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. - */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + mutex_lock(&kvm->arch.apicv_update_lock); + __kvm_request_apicv_update(kvm, activate, bit); + mutex_unlock(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9395,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + r = -EIO; + goto out; + } if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; @@ -9608,8 +9620,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } @@ -9639,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -9976,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } @@ -10581,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; @@ -10799,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10865,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. + * But, go through the motions in case that's ever remedied. + */ + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = 0x600; + kvm_rdx_write(vcpu, eax); + vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. + */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + /* * Reset the MMU context if paging was enabled prior to INIT (which is * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the @@ -10879,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) kvm_mmu_reset_context(vcpu); + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -11123,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); } @@ -11312,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { int level = i + 1; - int lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); WARN_ON(slot->arch.rmap[i]); @@ -11396,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) @@ -11481,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -11561,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44ae10312740..7d66d63dc55a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,8 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +void kvm_spurious_fault(void); + static __always_inline void kvm_guest_enter_irqoff(void) { /* diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ae17250e1efe..9ea9c3dabe37 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; - int ret; + int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, - gpa, PAGE_SIZE); - if (ret) + if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { + ret = -EFAULT; goto out; - - kvm->arch.xen.shinfo_set = true; + } + kvm->arch.xen.shinfo_gfn = gfn; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_set = false; + kvm->arch.xen.shinfo_gfn = GPA_INVALID; r = 0; break; } @@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_set) - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); - else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); r = 0; break; @@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } +void kvm_xen_init_vm(struct kvm *kvm) +{ + kvm->arch.xen.shinfo_gfn = GPA_INVALID; +} + void kvm_xen_destroy_vm(struct kvm *kvm) { if (kvm->arch.xen_hvm_config.msr) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 463a7844a8ca..cc0cf5f37450 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) @@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 1; } +static inline void kvm_xen_init_vm(struct kvm *kvm) +{ +} + static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 136b8d97d8c0..0d7865a0731c 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -2,7 +2,11 @@ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H -#include <linux/entry-common.h> +#include <linux/static_call_types.h> +#include <linux/tracehook.h> +#include <linux/syscalls.h> +#include <linux/seccomp.h> +#include <linux/sched.h> #include <linux/tick.h> /* Transfer to guest mode work */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..041ca7f15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 +#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -158,6 +159,15 @@ static inline bool is_error_page(struct page *page) }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) +bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, + unsigned long *vcpu_bitmap, cpumask_var_t tmp); +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); +bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, + unsigned long *vcpu_bitmap); + #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -344,6 +354,13 @@ struct kvm_vcpu { struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; + + /* + * The index of the most recently used memslot by this vCPU. It's ok + * if this becomes stale due to memslot changes since we always check + * it is a valid slot. + */ + int last_used_slot; }; /* must be called with irqs disabled */ @@ -512,7 +529,7 @@ struct kvm_memslots { u64 generation; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; - atomic_t lru_slot; + atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; }; @@ -538,6 +555,11 @@ struct kvm { struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + /* Used to wait for completion of MMU notifiers. */ + spinlock_t mn_invalidate_lock; + unsigned long mn_active_invalidate_count; + struct rcuwait mn_memslots_update_rcuwait; + /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only @@ -596,6 +618,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool vm_bugged; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -629,6 +652,30 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline void kvm_vm_bugged(struct kvm *kvm) +{ + kvm->vm_bugged = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); +} + +#define KVM_BUG(cond, kvm, fmt...) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + +#define KVM_BUG_ON(cond, kvm) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); @@ -720,6 +767,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); +bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); @@ -824,7 +872,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_get_pfn(kvm_pfn_t pfn); void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, @@ -943,14 +990,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except, - unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except); -bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap); +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -1034,6 +1077,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); +int kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* @@ -1157,29 +1201,49 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * search_memslots() and __gfn_to_memslot() are here because they are - * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. - * gfn_to_memslot() itself isn't here as an inline because that would - * bloat other code too much. + * Returns a pointer to the memslot at slot_index if it contains gfn. + * Otherwise returns NULL. + */ +static inline struct kvm_memory_slot * +try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + if (slot_index < 0 || slot_index >= slots->used_slots) + return NULL; + + /* + * slot_index can come from vcpu->last_used_slot which is not kept + * in sync with userspace-controllable memslot deletion. So use nospec + * to prevent the CPU from speculating past the end of memslots[]. + */ + slot_index = array_index_nospec(slot_index, slots->used_slots); + slot = &slots->memslots[slot_index]; + + if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) + return slot; + else + return NULL; +} + +/* + * Returns a pointer to the memslot that contains gfn and records the index of + * the slot in index. Otherwise returns NULL. * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! */ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); struct kvm_memory_slot *memslots = slots->memslots; + struct kvm_memory_slot *slot; if (unlikely(!slots->used_slots)) return NULL; - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return &memslots[slot]; - while (start < end) { - slot = start + (end - start) / 2; + int slot = start + (end - start) / 2; if (gfn >= memslots[slot].base_gfn) end = slot; @@ -1187,19 +1251,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) start = slot + 1; } - if (start < slots->used_slots && gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); - return &memslots[start]; + slot = try_get_memslot(slots, start, gfn); + if (slot) { + *index = start; + return slot; } return NULL; } +/* + * __gfn_to_memslot() and its descendants are here because it is called from + * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() + * itself isn't here as an inline because that would bloat other code too much. + */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { - return search_memslots(slots, gfn); + struct kvm_memory_slot *slot; + int slot_index = atomic_read(&slots->last_used_slot); + + slot = try_get_memslot(slots, slot_index, gfn); + if (slot) + return slot; + + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + atomic_set(&slots->last_used_slot, slot_index); + return slot; + } + + return NULL; } static inline unsigned long @@ -1273,56 +1355,66 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define STATS_DESC_COMMON(type, unit, base, exp) \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ - .size = 1 + .size = sz, \ + .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VM_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } -#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ -#define STATS_DESC(SCOPE, stat, type, unit, base, exp) \ - SCOPE##_STATS_DESC(stat, type, unit, base, exp) +#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ + unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ + unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ + unit, base, exponent, 1, 0) +#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ + unit, base, exponent, sz, bsz) +#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ + unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ @@ -1341,9 +1433,18 @@ struct _kvm_stats_desc { #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +/* Linear histogram for time in nanosecond */ +#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ + STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz, bsz) +/* Logarithmic histogram for time in nanosecond */ +#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ + STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ - STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ @@ -1351,13 +1452,62 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ + HALT_POLL_HIST_COUNT) extern struct dentry *kvm_debugfs_dir; + ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); + +/** + * kvm_stats_linear_hist_update() - Update bucket value for linear histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the linear histogram's bucket + * @bucket_size: the size (width) of a bucket + */ +static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, + u64 value, size_t bucket_size) +{ + size_t index = div64_u64(value, bucket_size); + + index = min(index, size - 1); + ++data[index]; +} + +/** + * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the logarithmic histogram's bucket + */ +static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) +{ + size_t index = fls64(value); + + index = min(index, size - 1); + ++data[index]; +} + +#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ + kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) +#define KVM_STATS_LOG_HIST_UPDATE(array, value) \ + kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) + + extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index ed6a985c5680..2237abb93ccd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -76,8 +76,11 @@ struct kvm_mmu_memory_cache { }; #endif +#define HALT_POLL_HIST_COUNT 32 + struct kvm_vm_stat_generic { u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; }; struct kvm_vcpu_stat_generic { @@ -87,6 +90,10 @@ struct kvm_vcpu_stat_generic { u64 halt_wakeup; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 halt_wait_ns; + u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT]; + u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT]; + u64 halt_wait_hist[HALT_POLL_HIST_COUNT]; }; #define KVM_STATS_NAME_SIZE 48 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..1ace27c4a8e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -633,43 +633,6 @@ static inline int PageTransCompound(struct page *page) } /* - * PageTransCompoundMap is the same as PageTransCompound, but it also - * guarantees the primary MMU has the entire compound page mapped - * through pmd_trans_huge, which in turn guarantees the secondary MMUs - * can also map the entire compound page. This allows the secondary - * MMUs to call get_user_pages() only once for each compound page and - * to immediately map the entire compound page with a single secondary - * MMU fault. If there will be a pmd split later, the secondary MMUs - * will get an update through the MMU notifier invalidation through - * split_huge_pmd(). - * - * Unlike PageTransCompound, this is safe to be called only while - * split_huge_pmd() cannot run from under us, like if protected by the - * MMU notifier, otherwise it may result in page->_mapcount check false - * positives. - * - * We have to treat page cache THP differently since every subpage of it - * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE - * mapped in the current process so comparing subpage's _mapcount to - * compound_mapcount to filter out PTE mapped case. - */ -static inline int PageTransCompoundMap(struct page *page) -{ - struct page *head; - - if (!PageTransCompound(page)) - return 0; - - if (PageAnon(page)) - return atomic_read(&page->_mapcount) < 0; - - head = compound_head(page); - /* File THP is PMD mapped and not PTE mapped */ - return atomic_read(&page->_mapcount) == - atomic_read(compound_mapcount_ptr(head)); -} - -/* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d9e4aabcb31a..a067410ebea5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1965,7 +1965,9 @@ struct kvm_stats_header { #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) -#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK +#define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@ -1988,8 +1990,9 @@ struct kvm_stats_header { * @size: The number of data items for this stats. * Every data item is of type __u64. * @offset: The offset of the stats to the start of stat structure in - * struture kvm or kvm_vcpu. - * @unused: Unused field for future usage. Always 0 for now. + * structure kvm or kvm_vcpu. + * @bucket_size: A parameter value used for histogram stats. It is only used + * for linear histogram stats, specifying the size of the bucket; * @name: The name string for the stats. Its size is indicated by the * &kvm_stats_header->name_size. */ @@ -1998,7 +2001,7 @@ struct kvm_stats_desc { __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 0709af0144c8..98053d3afbda 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only /aarch64/debug-exceptions /aarch64/get-reg-list +/aarch64/psci_cpu_on_test /aarch64/vgic_init /s390x/memop /s390x/resets diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 5832f510a16c..5d05801ab816 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -86,6 +86,7 @@ TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list +TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c new file mode 100644 index 000000000000..018c269990e1 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the + * CPU_ON PSCI call matches what the caller requested. + * + * Copyright (c) 2021 Google LLC. + * + * This is a regression test for a race between KVM servicing the PSCI call and + * userspace reading the vCPUs registers. + */ + +#define _GNU_SOURCE + +#include <linux/psci.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define VCPU_ID_SOURCE 0 +#define VCPU_ID_TARGET 1 + +#define CPU_ON_ENTRY_ADDR 0xfeedf00dul +#define CPU_ON_CONTEXT_ID 0xdeadc0deul + +static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr, + uint64_t context_id) +{ + register uint64_t x0 asm("x0") = PSCI_0_2_FN64_CPU_ON; + register uint64_t x1 asm("x1") = target_cpu; + register uint64_t x2 asm("x2") = entry_addr; + register uint64_t x3 asm("x3") = context_id; + + asm("hvc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2), "r"(x3) + : "memory"); + + return x0; +} + +static uint64_t psci_affinity_info(uint64_t target_affinity, + uint64_t lowest_affinity_level) +{ + register uint64_t x0 asm("x0") = PSCI_0_2_FN64_AFFINITY_INFO; + register uint64_t x1 asm("x1") = target_affinity; + register uint64_t x2 asm("x2") = lowest_affinity_level; + + asm("hvc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2) + : "memory"); + + return x0; +} + +static void guest_main(uint64_t target_cpu) +{ + GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID)); + uint64_t target_state; + + do { + target_state = psci_affinity_info(target_cpu, 0); + + GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) || + (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF)); + } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + GUEST_DONE(); +} + +int main(void) +{ + uint64_t target_mpidr, obs_pc, obs_x0; + struct kvm_vcpu_init init; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name); + ucall_init(vm, NULL); + + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_main); + + /* + * make sure the target is already off when executing the test. + */ + init.features[0] |= (1 << KVM_ARM_VCPU_POWER_OFF); + aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_main); + + get_reg(vm, VCPU_ID_TARGET, ARM64_SYS_REG(MPIDR_EL1), &target_mpidr); + vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK); + vcpu_run(vm, VCPU_ID_SOURCE); + + switch (get_ucall(vm, VCPU_ID_SOURCE, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, + uc.args[1]); + break; + default: + TEST_FAIL("Unhandled ucall: %lu", uc.cmd); + } + + get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.pc), &obs_pc); + get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + + TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, + "unexpected target cpu pc: %lx (expected: %lx)", + obs_pc, CPU_ON_ENTRY_ADDR); + TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID, + "unexpected target context id: %lx (expected: %lx)", + obs_x0, CPU_ON_CONTEXT_ID); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index e2baa187a21e..71e277c7c3f3 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -222,8 +222,6 @@ static void *vcpu_thread_main(void *arg) int vcpu_id = vcpu_args->vcpu_id; int current_iteration = -1; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); - while (spin_wait_for_next_iteration(¤t_iteration)) { switch (READ_ONCE(iteration_work)) { case ITERATION_ACCESS_MEMORY: @@ -333,7 +331,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pthread_t *vcpu_threads; int vcpus = params->vcpus; - vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, + vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1, params->backing_src); perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index b74704305835..e79c1b64977f 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -52,7 +52,6 @@ static void *vcpu_worker(void *data) struct timespec start; struct timespec ts_diff; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); clock_gettime(CLOCK_MONOTONIC, &start); @@ -293,7 +292,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; int r; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type); perf_test_args.wr_fract = 1; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 80cbd3a748c0..3c30d0045d8d 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -44,7 +44,6 @@ static void *vcpu_worker(void *data) struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); while (!READ_ONCE(host_quit)) { @@ -94,8 +93,59 @@ struct test_params { int wr_fract; bool partition_vcpu_memory_access; enum vm_mem_backing_src_type backing_src; + int slots; }; +static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; + + vm_mem_region_set_flags(vm, slot, flags); + } +} + +static inline void enable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, true); +} + +static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, false); +} + +static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, + uint64_t nr_pages) +{ + uint64_t slot_pages = nr_pages / slots; + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + unsigned long *slot_bitmap = bitmap + i * slot_pages; + + kvm_vm_get_dirty_log(vm, slot, slot_bitmap); + } +} + +static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, + uint64_t nr_pages) +{ + uint64_t slot_pages = nr_pages / slots; + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + unsigned long *slot_bitmap = bitmap + i * slot_pages; + + kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages); + } +} + static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; @@ -114,7 +164,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec clear_dirty_log_total = (struct timespec){0}; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - p->backing_src); + p->slots, p->backing_src); perf_test_args.wr_fract = p->wr_fract; @@ -163,8 +213,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Enable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, - KVM_MEM_LOG_DIRTY_PAGES); + enable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -190,8 +239,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap); - + get_dirty_log(vm, p->slots, bmap, host_num_pages); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -200,9 +248,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); - + clear_dirty_log(vm, p->slots, bmap, host_num_pages); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -213,7 +259,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0); + disable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -244,7 +290,8 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] " - "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name); + "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" + "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -263,6 +310,8 @@ static void help(char *name) " them into a separate region of memory for each vCPU.\n"); printf(" -s: specify the type of memory that should be used to\n" " back the guest data region.\n\n"); + printf(" -x: Split the memory region into this number of memslots.\n" + " (default: 1)"); backing_src_help(); puts(""); exit(0); @@ -276,6 +325,7 @@ int main(int argc, char *argv[]) .wr_fract = 1, .partition_vcpu_memory_access = true, .backing_src = VM_MEM_SRC_ANONYMOUS, + .slots = 1, }; int opt; @@ -286,7 +336,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) { + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) { switch (opt) { case 'i': p.iterations = atoi(optarg); @@ -316,6 +366,9 @@ int main(int argc, char *argv[]) case 's': p.backing_src = parse_backing_src_type(optarg); break; + case 'x': + p.slots = atoi(optarg); + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 27dc5c2e56b9..c0273aefa63d 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -17,6 +17,7 @@ #define CPACR_EL1 3, 0, 1, 0, 2 #define TCR_EL1 3, 0, 2, 0, 2 #define MAIR_EL1 3, 0, 10, 2, 0 +#define MPIDR_EL1 3, 0, 0, 0, 5 #define TTBR0_EL1 3, 0, 2, 0, 0 #define SCTLR_EL1 3, 0, 1, 0, 0 #define VBAR_EL1 3, 0, 12, 0, 0 @@ -40,6 +41,8 @@ (0xfful << (4 * 8)) | \ (0xbbul << (5 * 8))) +#define MPIDR_HWID_BITMASK (0xff00fffffful) + static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) { struct kvm_one_reg reg; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 005f2143adeb..df9f1a3a3ffb 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -44,7 +44,7 @@ extern struct perf_test_args perf_test_args; extern uint64_t guest_test_phys_mem; struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src); void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 5906bbc08483..17f65d514915 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -109,6 +109,18 @@ static void stats_test(int stats_fd) /* Check size field, which should not be zero */ TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0", pdesc->name); + /* Check bucket_size field */ + switch (pdesc->flags & KVM_STATS_TYPE_MASK) { + case KVM_STATS_TYPE_LINEAR_HIST: + TEST_ASSERT(pdesc->bucket_size, + "Bucket size of Linear Histogram stats (%s) is zero", + pdesc->name); + break; + default: + TEST_ASSERT(!pdesc->bucket_size, + "Bucket size of stats (%s) is not zero", + pdesc->name); + } size_data += pdesc->size * sizeof(*stats_data); } /* Check overlap */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index b488f4aefea8..0ef80dbdc116 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -50,11 +50,12 @@ static void guest_code(uint32_t vcpu_id) } struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src) { struct kvm_vm *vm; uint64_t guest_num_pages; + int i; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); @@ -68,6 +69,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, "Guest memory size is not host page size aligned."); TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, @@ -95,10 +99,16 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - /* Add an extra memory slot for testing */ - vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem, - PERF_TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = guest_test_phys_mem + + region_pages * perf_test_args.guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + PERF_TEST_MEM_SLOT_INDEX + i, + region_pages, 0); + } /* Do mapping for the demand paging memory slot */ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); @@ -140,6 +150,8 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, vcpu_gpa = guest_test_phys_mem; } + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + (vcpu_args->pages * perf_test_args.guest_page_size)); diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 98351ba0933c..4cfcafea9f5a 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -45,7 +45,6 @@ static void *vcpu_worker(void *data) struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); /* Let the guest access its memory until a stop signal is received */ @@ -105,7 +104,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; int vcpu_id; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS); perf_test_args.wr_fract = 1; diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 6097a8283377..5f078db1bcba 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -8,12 +8,15 @@ #include <string.h> #include "kvm_util.h" #include "processor.h" +#include "apic.h" #define VCPU_ID 0 #define DR6_BD (1 << 13) #define DR7_GD (1 << 13) +#define IRQ_VECTOR 0xAA + /* For testing data access debug BP */ uint32_t guest_value; @@ -21,6 +24,11 @@ extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; static void guest_code(void) { + /* Create a pending interrupt on current vCPU */ + x2apic_enable(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | + APIC_DM_FIXED | IRQ_VECTOR); + /* * Software BP tests. * @@ -38,12 +46,19 @@ static void guest_code(void) "mov %%rax,%0;\n\t write_data:" : "=m" (guest_value) : : "rax"); - /* Single step test, covers 2 basic instructions and 2 emulated */ + /* + * Single step test, covers 2 basic instructions and 2 emulated + * + * Enable interrupts during the single stepping to see that + * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ + */ asm volatile("ss_start: " + "sti\n\t" "xor %%eax,%%eax\n\t" "cpuid\n\t" "movl $0x1a0,%%ecx\n\t" "rdmsr\n\t" + "cli\n\t" : : : "eax", "ebx", "ecx", "edx"); /* DR6.BD test */ @@ -72,11 +87,13 @@ int main(void) uint64_t cmd; int i; /* Instruction lengths starting at ss_start */ - int ss_size[4] = { + int ss_size[6] = { + 1, /* sti*/ 2, /* xor */ 2, /* cpuid */ 5, /* mov */ 2, /* rdmsr */ + 1, /* cli */ }; if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { @@ -154,7 +171,8 @@ int main(void) for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { target_rip += ss_size[i]; CLEAR_DEBUG(); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | + KVM_GUESTDBG_BLOCKIRQ; debug.arch.debugreg[7] = 0x00000400; APPLY_DEBUG(); vcpu_run(vm, VCPU_ID); diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c index e609d428811a..eefca6c69f51 100644 --- a/virt/kvm/binary_stats.c +++ b/virt/kvm/binary_stats.c @@ -136,9 +136,7 @@ ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, src = stats + pos - header->data_offset; if (copy_to_user(dest, src, copylen)) return -EFAULT; - remain -= copylen; pos += copylen; - dest += copylen; } *offset = pos; diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 7aafefc50aa7..88f4683198ea 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -91,11 +91,6 @@ static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) gfn->flags = KVM_DIRTY_GFN_F_DIRTY; } -static inline bool kvm_dirty_gfn_invalid(struct kvm_dirty_gfn *gfn) -{ - return gfn->flags == 0; -} - static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) { return gfn->flags & KVM_DIRTY_GFN_F_RESET; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b50dbe269f4b..439d3b4cd1a9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -189,16 +189,6 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) return true; } -bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) -{ - struct page *page = pfn_to_page(pfn); - - if (!PageTransCompoundMap(page)) - return false; - - return is_transparent_hugepage(compound_head(page)); -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -318,6 +308,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) */ long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); + ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. @@ -415,6 +406,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + vcpu->last_used_slot = 0; } void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -496,17 +488,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - /* The on_lock() path does not yet support lock elision. */ - if (!IS_KVM_NULL_FN(range->on_lock)) { - locked = true; - KVM_MMU_LOCK(kvm); - - range->on_lock(kvm, range->start, range->end); - - if (IS_KVM_NULL_FN(range->handler)) - goto out_unlock; - } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, slots) { @@ -538,6 +519,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (!locked) { locked = true; KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm, range->start, range->end); + if (IS_KVM_NULL_FN(range->handler)) + break; } ret |= range->handler(kvm, &gfn_range); } @@ -546,7 +531,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) kvm_flush_remote_tlbs(kvm); -out_unlock: if (locked) KVM_MMU_UNLOCK(kvm); @@ -604,16 +588,20 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, trace_kvm_set_spte_hva(address); /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(), - * and so always runs with an elevated notifier count. This obviates - * the need to bump the sequence count. + * .change_pte() must be surrounded by .invalidate_range_{start,end}(). + * If mmu_notifier_count is zero, then no in-progress invalidations, + * including this one, found a relevant memslot at start(); rechecking + * memslots here is unnecessary. Note, a false positive (count elevated + * by a different invalidation) is sub-optimal but functionally ok. */ - WARN_ON_ONCE(!kvm->mmu_notifier_count); + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + if (!READ_ONCE(kvm->mmu_notifier_count)) + return; kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } -static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -658,12 +646,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, trace_kvm_unmap_hva_range(range->start, range->end); + /* + * Prevent memslot modification between range_start() and range_end() + * so that conditionally locking provides the same result in both + * functions. Without that guarantee, the mmu_notifier_count + * adjustments will be imbalanced. + * + * Pairs with the decrement in range_end(). + */ + spin_lock(&kvm->mn_invalidate_lock); + kvm->mn_active_invalidate_count++; + spin_unlock(&kvm->mn_invalidate_lock); + __kvm_handle_hva_range(kvm, &hva_range); return 0; } -static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -694,9 +694,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; + bool wake; __kvm_handle_hva_range(kvm, &hva_range); + /* Pairs with the increment in range_start(). */ + spin_lock(&kvm->mn_invalidate_lock); + wake = (--kvm->mn_active_invalidate_count == 0); + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * There can only be one waiter, since the wait happens under + * slots_lock. + */ + if (wake) + rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + BUG_ON(kvm->mmu_notifier_count < 0); } @@ -897,7 +910,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; - int i; + int i, ret; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; @@ -954,6 +967,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } + + ret = kvm_arch_create_vm_debugfs(kvm); + if (ret) { + kvm_destroy_vm_debugfs(kvm); + return i; + } + return 0; } @@ -974,6 +994,17 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) { } +/* + * Called after per-vm debugfs created. When called kvm->debugfs_dentry should + * be setup already, so we can create arch-specific debugfs entries under it. + * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so + * a per-arch destroy interface is not needed. + */ +int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + return 0; +} + static struct kvm *kvm_create_vm(unsigned long type) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -991,6 +1022,9 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); + spin_lock_init(&kvm->mn_invalidate_lock); + rcuwait_init(&kvm->mn_memslots_update_rcuwait); + INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1113,6 +1147,16 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); + /* + * At this point, pending calls to invalidate_range_start() + * have completed but no more MMU notifiers will run, so + * mn_active_invalidate_count may remain unbalanced. + * No threads can be waiting in install_new_memslots as the + * last reference on KVM has been dropped, but freeing + * memslots would deadlock without this manual intervention. + */ + WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); + kvm->mn_active_invalidate_count = 0; #else kvm_arch_flush_shadow_all(kvm); #endif @@ -1134,6 +1178,16 @@ void kvm_get_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_get_kvm); +/* + * Make sure the vm is not during destruction, which is a safe version of + * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. + */ +bool kvm_get_kvm_safe(struct kvm *kvm) +{ + return refcount_inc_not_zero(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); + void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) @@ -1194,8 +1248,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots, slots->used_slots--; - if (atomic_read(&slots->lru_slot) >= slots->used_slots) - atomic_set(&slots->lru_slot, 0); + if (atomic_read(&slots->last_used_slot) >= slots->used_slots) + atomic_set(&slots->last_used_slot, 0); for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { mslots[i] = mslots[i + 1]; @@ -1364,7 +1418,22 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + /* + * Do not store the new memslots while there are invalidations in + * progress, otherwise the locking in invalidate_range_start and + * invalidate_range_end will be unbalanced. + */ + spin_lock(&kvm->mn_invalidate_lock); + prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); + while (kvm->mn_active_invalidate_count) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); + spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize @@ -1980,7 +2049,26 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memory_slot *slot; + int slot_index; + + slot = try_get_memslot(slots, vcpu->last_used_slot, gfn); + if (slot) + return slot; + + /* + * Fall back to searching all memslots. We purposely use + * search_memslots() instead of __gfn_to_memslot() to avoid + * thrashing the VM-wide last_used_index in kvm_memslots. + */ + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + vcpu->last_used_slot = slot_index; + return slot; + } + + return NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); @@ -2239,7 +2327,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * Get a reference here because callers of *hva_to_pfn* and * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will * simply do nothing for reserved pfns. * * Whoever called remap_pfn_range is also going to call e.g. @@ -2636,13 +2724,6 @@ void kvm_set_pfn_accessed(kvm_pfn_t pfn) } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); -void kvm_get_pfn(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - get_page(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_get_pfn); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) @@ -3122,13 +3203,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_success_hist, + ktime_to_ns(ktime_get()) - + ktime_to_ns(start)); goto out; } cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_fail_hist, + ktime_to_ns(ktime_get()) - ktime_to_ns(start)); } + prepare_to_rcuwait(&vcpu->wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -3141,6 +3232,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } finish_rcuwait(&vcpu->wait); cur = ktime_get(); + if (waited) { + vcpu->stat.generic.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(poll_end); + KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(poll_end)); + } out: kvm_arch_vcpu_unblocking(vcpu); block_ns = ktime_to_ns(cur) - ktime_to_ns(start); @@ -3612,7 +3709,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) @@ -3822,7 +3919,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, void __user *argp = compat_ptr(arg); int r; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -3888,7 +3985,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; - if (dev->kvm->mm != current->mm) + if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -4210,7 +4307,7 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: @@ -4421,7 +4518,7 @@ static long kvm_vm_compat_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT @@ -4983,12 +5080,12 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, struct kvm_stat_data *stat_data = (struct kvm_stat_data *) inode->i_private; - /* The debugfs files are a reference to the kvm struct which - * is still valid when kvm_destroy_vm is called. - * To avoid the race between open and the removal of the debugfs - * directory we test against the users count. + /* + * The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe + * avoids the race between open and the removal of the debugfs directory. */ - if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; if (simple_attr_open(inode, file, get, |