diff options
59 files changed, 1372 insertions, 305 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7b83b176c662..c664064f76fb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -835,11 +835,13 @@ struct kvm_clock_data { Capability: KVM_CAP_VCPU_EVENTS Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86 -Type: vm ioctl +Architectures: x86, arm, arm64 +Type: vcpu ioctl Parameters: struct kvm_vcpu_event (out) Returns: 0 on success, -1 on error +X86: + Gets currently pending exceptions, interrupts, and NMIs as well as related states of the vcpu. @@ -881,15 +883,64 @@ Only two fields are defined in the flags field: - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that smi contains a valid state. +ARM/ARM64: + +If the guest accesses a device that is being emulated by the host kernel in +such a way that a real device would generate a physical SError, KVM may make +a virtual SError pending for that VCPU. This system error interrupt remains +pending until the guest takes the exception by unmasking PSTATE.A. + +Running the VCPU may cause it to take a pending SError, or make an access that +causes an SError to become pending. The event's description is only valid while +the VPCU is not running. + +This API provides a way to read and write the pending 'event' state that is not +visible to the guest. To save, restore or migrate a VCPU the struct representing +the state can be read then written using this GET/SET API, along with the other +guest-visible registers. It is not possible to 'cancel' an SError that has been +made pending. + +A device being emulated in user-space may also wish to generate an SError. To do +this the events structure can be populated by user-space. The current state +should be read first, to ensure no existing SError is pending. If an existing +SError is pending, the architecture's 'Multiple SError interrupts' rules should +be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and +Serviceability (RAS) Specification"). + +SError exceptions always have an ESR value. Some CPUs have the ability to +specify what the virtual SError's ESR value should be. These systems will +advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will +always have a non-zero value when read, and the agent making an SError pending +should specify the ISS field in the lower 24 bits of exception.serror_esr. If +the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events +with exception.has_esr as zero, KVM will choose an ESR. + +Specifying exception.has_esr on a system that does not support it will return +-EINVAL. Setting anything other than the lower 24bits of exception.serror_esr +will return -EINVAL. + +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + 4.32 KVM_SET_VCPU_EVENTS Capability: KVM_CAP_VCPU_EVENTS Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86 -Type: vm ioctl +Architectures: x86, arm, arm64 +Type: vcpu ioctl Parameters: struct kvm_vcpu_event (in) Returns: 0 on success, -1 on error +X86: + Set pending exceptions, interrupts, and NMIs as well as related states of the vcpu. @@ -910,6 +961,13 @@ shall be written into the VCPU. KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. +ARM/ARM64: + +Set the pending SError exception state for this VCPU. It is not possible to +'cancel' an Serror that has been made pending. + +See KVM_GET_VCPU_EVENTS for the data structure. + 4.33 KVM_GET_DEBUGREGS @@ -4690,3 +4748,17 @@ This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush hypercalls: HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. + +8.19 KVM_CAP_ARM_INJECT_SERROR_ESR + +Architectures: arm, arm64 + +This capability indicates that userspace can specify (via the +KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it +takes a virtual SError interrupt exception. +If KVM advertises this capability, userspace can only specify the ISS field for +the ESR syndrome. Other parts of the ESR, such as the EC are generated by the +CPU when the exception is taken. If this virtual SError is taken to EL1 using +AArch64, this value will be reported in the ISS field of ESR_ELx. + +See KVM_CAP_VCPU_EVENTS for more details. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt index 2408ab720ef7..ff290b43c8e5 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt @@ -100,6 +100,14 @@ Groups: Note that distributor fields are not banked, but return the same value regardless of the mpidr used to access the register. + GICD_IIDR.Revision is updated when the KVM implementation is changed in a + way directly observable by the guest or userspace. Userspace should read + GICD_IIDR from KVM and write back the read value to confirm its expected + behavior is aligned with the KVM implementation. Userspace should set + GICD_IIDR before setting any other registers to ensure the expected + behavior. + + The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such that a write of a clear bit has no effect, whereas a write with a set bit clears that value. To allow userspace to freely set the values of these two diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index b3ce12643553..97b6518148f8 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -49,9 +49,15 @@ Groups: index is specified with the vcpu_index field. Note that most distributor fields are not banked, but return the same value regardless of the vcpu_index used to access the register. - Limitations: - - Priorities are not implemented, and registers are RAZ/WI - - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. + + GICD_IIDR.Revision is updated when the KVM implementation of an emulated + GICv2 is changed in a way directly observable by the guest or userspace. + Userspace should read GICD_IIDR from KVM and write back the read value to + confirm its expected behavior is aligned with the KVM implementation. + Userspace should set GICD_IIDR before setting any other registers (both + KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure + the expected behavior. Unless GICD_IIDR has been set from userspace, writes + to the interrupt group registers (GICD_IGROUPR) are ignored. Errors: -ENXIO: Getting or setting this register is not yet supported -EBUSY: One or more VCPUs are running @@ -94,9 +100,6 @@ Groups: use the lower 5 bits to communicate with the KVM device and must shift the value left by 3 places to obtain the actual priority mask level. - Limitations: - - Priorities are not implemented, and registers are RAZ/WI - - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. Errors: -ENXIO: Getting or setting this register is not yet supported -EBUSY: One or more VCPUs are running diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index fe2fb1ddd771..77121b713bef 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -107,9 +107,19 @@ static inline unsigned long *vcpu_hcr(const struct kvm_vcpu *vcpu) return (unsigned long *)&vcpu->arch.hcr; } +static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr &= ~HCR_TWE; +} + +static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr |= HCR_TWE; +} + static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) { - return 1; + return true; } static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1f1fe4109b02..79906cecb091 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -216,6 +216,11 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); +int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); + +int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 8553d68b7c8a..265ea9cf7df7 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -75,17 +75,9 @@ phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); -static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) -{ - *pmd = new_pmd; - dsb(ishst); -} - -static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) -{ - *pte = new_pte; - dsb(ishst); -} +#define kvm_mk_pmd(ptep) __pmd(__pa(ptep) | PMD_TYPE_TABLE) +#define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) +#define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 16e006f708ca..4602464ebdfb 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -125,6 +126,18 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + /* If you need to interpret the index values, here is the key: */ #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 #define KVM_REG_ARM_COPROC_SHIFT 16 diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 3a02e76699a6..450c7a4fbc8a 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -246,6 +246,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, const struct coproc_reg *r) { u64 reg; + bool g1; if (!p->is_write) return read_from_write_only(vcpu, p); @@ -253,7 +254,25 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; reg |= *vcpu_reg(vcpu, p->Rt1) ; - vgic_v3_dispatch_sgi(vcpu, reg); + /* + * In a system where GICD_CTLR.DS=1, a ICC_SGI0R access generates + * Group0 SGIs only, while ICC_SGI1R can generate either group, + * depending on the SGI configuration. ICC_ASGI1R is effectively + * equivalent to ICC_SGI0R, as there is no "alternative" secure + * group. + */ + switch (p->Op1) { + default: /* Keep GCC quiet */ + case 0: /* ICC_SGI1R */ + g1 = true; + break; + case 1: /* ICC_ASGI1R */ + case 2: /* ICC_SGI0R */ + g1 = false; + break; + } + + vgic_v3_dispatch_sgi(vcpu, reg, g1); return true; } @@ -459,6 +478,10 @@ static const struct coproc_reg cp15_regs[] = { /* ICC_SGI1R */ { CRm64(12), Op1( 0), is64, access_gic_sgi}, + /* ICC_ASGI1R */ + { CRm64(12), Op1( 1), is64, access_gic_sgi}, + /* ICC_SGI0R */ + { CRm64(12), Op1( 2), is64, access_gic_sgi}, /* VBAR: swapped by interrupt.S. */ { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32, diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index a18f33edc471..2b8de885b2bf 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -261,6 +261,29 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return -EINVAL; } + +int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + events->exception.serror_pending = !!(*vcpu_hcr(vcpu) & HCR_VA); + + return 0; +} + +int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + bool serror_pending = events->exception.serror_pending; + bool has_esr = events->exception.serror_has_esr; + + if (serror_pending && has_esr) + return -EINVAL; + else if (serror_pending) + kvm_inject_vabt(vcpu); + + return 0; +} + int __attribute_const__ kvm_target_cpu(void) { switch (read_cpuid_part()) { diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index be3bf3d08916..ae1f70450fb2 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -50,7 +50,8 @@ #define ARM64_HW_DBM 29 #define ARM64_SSBD 30 #define ARM64_MISMATCHED_CACHE_TYPE 31 +#define ARM64_HAS_STAGE2_FWB 32 -#define ARM64_NCAPS 32 +#define ARM64_NCAPS 33 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 6dd285e979c9..aa45df752a16 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -23,6 +23,7 @@ #include <asm/types.h> /* Hyp Configuration Register (HCR) bits */ +#define HCR_FWB (UL(1) << 46) #define HCR_TEA (UL(1) << 37) #define HCR_TERR (UL(1) << 36) #define HCR_TLOR (UL(1) << 35) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0c97e45d1dc3..6106a85ae0be 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -63,6 +63,8 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) /* trap error record accesses */ vcpu->arch.hcr_el2 |= HCR_TERR; } + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) vcpu->arch.hcr_el2 &= ~HCR_RW; @@ -81,6 +83,21 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) return (unsigned long *)&vcpu->arch.hcr_el2; } +static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 &= ~HCR_TWE; +} + +static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 |= HCR_TWE; +} + +static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.vsesr_el2; +} + static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr) { vcpu->arch.vsesr_el2 = vsesr; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fe8777b12f86..f26055f2306e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -350,6 +350,11 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); +int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); + +int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); @@ -378,16 +383,23 @@ void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int kvm_perf_init(void); int kvm_perf_teardown(void); +void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); + struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -void __kvm_set_tpidr_el2(u64 tpidr_el2); DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { - u64 tpidr_el2; + /* + * Calculate the raw per-cpu offset without a translation from the + * kernel's mapping to the linear mapping, and store it in tpidr_el2 + * so that we can use adr_l to access per-cpu variables in EL2. + */ + u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_cpu_state) - + (u64)kvm_ksym_ref(kvm_host_cpu_state)); /* * Call initialization code, and switch to the full blown HYP code. @@ -396,17 +408,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, * cpus_have_const_cap() wrapper. */ BUG_ON(!static_branch_likely(&arm64_const_caps_ready)); - __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); - - /* - * Calculate the raw per-cpu offset without a translation from the - * kernel's mapping to the linear mapping, and store it in tpidr_el2 - * so that we can use adr_l to access per-cpu variables in EL2. - */ - tpidr_el2 = (u64)this_cpu_ptr(&kvm_host_cpu_state) - - (u64)kvm_ksym_ref(kvm_host_cpu_state); - - kvm_call_hyp(__kvm_set_tpidr_el2, tpidr_el2); + __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); } static inline bool kvm_arch_check_sve_has_vhe(void) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index fb9a7127bb75..d6fff7de5539 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -169,8 +169,12 @@ phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); -#define kvm_set_pte(ptep, pte) set_pte(ptep, pte) -#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) +#define kvm_mk_pmd(ptep) \ + __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) +#define kvm_mk_pud(pmdp) \ + __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) +#define kvm_mk_pgd(pudp) \ + __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { @@ -267,6 +271,15 @@ static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) { void *va = page_address(pfn_to_page(pfn)); + /* + * With FWB, we ensure that the guest always accesses memory using + * cacheable attributes, and we don't have to clean to PoC when + * faulting in pages. Furthermore, FWB implies IDC, so cleaning to + * PoU is not required either in this case. + */ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return; + kvm_flush_dcache_to_poc(va, size); } @@ -287,20 +300,26 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, static inline void __kvm_flush_dcache_pte(pte_t pte) { - struct page *page = pte_page(pte); - kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE); + if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { + struct page *page = pte_page(pte); + kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE); + } } static inline void __kvm_flush_dcache_pmd(pmd_t pmd) { - struct page *page = pmd_page(pmd); - kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE); + if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { + struct page *page = pmd_page(pmd); + kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE); + } } static inline void __kvm_flush_dcache_pud(pud_t pud) { - struct page *page = pud_page(pud); - kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE); + if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { + struct page *page = pud_page(pud); + kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE); + } } #define kvm_virt_to_phys(x) __pa_symbol(x) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 49d99214f43c..b96442960aea 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -155,6 +155,13 @@ #define MT_S2_NORMAL 0xf #define MT_S2_DEVICE_nGnRE 0x1 +/* + * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001 + * Stage-2 enforces Normal-WB and Device-nGnRE + */ +#define MT_S2_FWB_NORMAL 6 +#define MT_S2_FWB_DEVICE_nGnRE 1 + #ifdef CONFIG_ARM64_4K_PAGES #define IOREMAP_MAX_ORDER (PUD_SHIFT) #else diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 108ecad7acc5..78b942c1bea4 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -67,8 +67,28 @@ #define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) -#define PAGE_S2 __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY | PTE_S2_XN) -#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) +#define PAGE_S2_MEMATTR(attr) \ + ({ \ + u64 __val; \ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) \ + __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ + else \ + __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ + __val; \ + }) + +#define PAGE_S2_XN \ + ({ \ + u64 __val; \ + if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \ + __val = 0; \ + else \ + __val = PTE_S2_XN; \ + __val; \ + }) + +#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN) +#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PAGE_S2_XN) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index e205ec8489e9..c1470931b897 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -314,6 +314,8 @@ #define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) #define SYS_ICC_RPR_EL1 sys_reg(3, 0, 12, 11, 3) #define SYS_ICC_SGI1R_EL1 sys_reg(3, 0, 12, 11, 5) +#define SYS_ICC_ASGI1R_EL1 sys_reg(3, 0, 12, 11, 6) +#define SYS_ICC_SGI0R_EL1 sys_reg(3, 0, 12, 11, 7) #define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) #define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) #define SYS_ICC_HPPIR1_EL1 sys_reg(3, 0, 12, 12, 2) @@ -579,6 +581,7 @@ #define ID_AA64MMFR1_VMIDBITS_16 2 /* id_aa64mmfr2 */ +#define ID_AA64MMFR2_FWB_SHIFT 40 #define ID_AA64MMFR2_AT_SHIFT 32 #define ID_AA64MMFR2_LVA_SHIFT 16 #define ID_AA64MMFR2_IESB_SHIFT 12 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 4e76630dd655..97c3478ee6e7 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -154,6 +155,18 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + /* If you need to interpret the index values, here is the key: */ #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 #define KVM_REG_ARM_COPROC_SHIFT 16 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 611e8921c3d4..e238b7932096 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -192,6 +192,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), @@ -1026,6 +1027,14 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused) } #endif +static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused) +{ + u64 val = read_sysreg_s(SYS_CLIDR_EL1); + + /* Check that CLIDR_EL1.LOU{U,IS} are both 0 */ + WARN_ON(val & (7 << 27 | 7 << 21)); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -1182,6 +1191,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cache_dic, }, + { + .desc = "Stage-2 Force Write-Back", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_STAGE2_FWB, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_FWB_SHIFT, + .min_field_value = 1, + .matches = has_cpuid_feature, + .cpu_enable = cpu_has_fwb, + }, #ifdef CONFIG_ARM64_HW_AFDBM { /* diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index cdd4d9d6d575..07256b08226c 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -289,6 +289,39 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return -EINVAL; } +int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE); + events->exception.serror_has_esr = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); + + if (events->exception.serror_pending && events->exception.serror_has_esr) + events->exception.serror_esr = vcpu_get_vsesr(vcpu); + + return 0; +} + +int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + bool serror_pending = events->exception.serror_pending; + bool has_esr = events->exception.serror_has_esr; + + if (serror_pending && has_esr) { + if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) + return -EINVAL; + + if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK)) + kvm_set_sei_esr(vcpu, events->exception.serror_esr); + else + return -EINVAL; + } else if (serror_pending) { + kvm_inject_vabt(vcpu); + } + + return 0; +} + int __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6fd91b31a131..ea9225160786 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -57,6 +57,7 @@ __invalid: * x0: HYP pgd * x1: HYP stack * x2: HYP vectors + * x3: per-CPU offset */ __do_hyp_init: /* Check for a stub HVC call */ @@ -119,9 +120,8 @@ CPU_BE( orr x4, x4, #SCTLR_ELx_EE) mov sp, x1 msr vbar_el2, x2 - /* copy tpidr_el1 into tpidr_el2 for use by HYP */ - mrs x1, tpidr_el1 - msr tpidr_el2, x1 + /* Set tpidr_el2 for use by HYP */ + msr tpidr_el2, x3 /* Hello, World! */ eret diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 35bc16832efe..9ce223944983 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -288,8 +288,3 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) vcpu->arch.sysregs_loaded_on_cpu = false; } - -void __hyp_text __kvm_set_tpidr_el2(u64 tpidr_el2) -{ - asm("msr tpidr_el2, %0": : "r" (tpidr_el2)); -} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index d8e71659ba7e..a55e91dfcf8f 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -164,9 +164,9 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) inject_undef64(vcpu); } -static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr) +void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr) { - vcpu_set_vsesr(vcpu, esr); + vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK); *vcpu_hcr(vcpu) |= HCR_VSE; } @@ -184,5 +184,5 @@ static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr) */ void kvm_inject_vabt(struct kvm_vcpu *vcpu) { - pend_guest_serror(vcpu, ESR_ELx_ISV); + kvm_set_sei_esr(vcpu, ESR_ELx_ISV); } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 4e4aedaf7ab7..e37c78bbe1ca 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -77,8 +77,12 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PMU_V3: r = kvm_arm_support_pmu_v3(); break; + case KVM_CAP_ARM_INJECT_SERROR_ESR: + r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN); + break; case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_VCPU_EVENTS: r = 1; break; default: diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a4363735d3f8..22fbbdbece3c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -194,7 +194,16 @@ static bool access_dcsw(struct kvm_vcpu *vcpu, if (!p->is_write) return read_from_write_only(vcpu, p, r); - kvm_set_way_flush(vcpu); + /* + * Only track S/W ops if we don't have FWB. It still indicates + * that the guest is a bit broken (S/W operations should only + * be done by firmware, knowing that there is only a single + * CPU left in the system, and certainly not from non-secure + * software). + */ + if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + kvm_set_way_flush(vcpu); + return true; } @@ -243,10 +252,43 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + bool g1; + if (!p->is_write) return read_from_write_only(vcpu, p, r); - vgic_v3_dispatch_sgi(vcpu, p->regval); + /* + * In a system where GICD_CTLR.DS=1, a ICC_SGI0R_EL1 access generates + * Group0 SGIs only, while ICC_SGI1R_EL1 can generate either group, + * depending on the SGI configuration. ICC_ASGI1R_EL1 is effectively + * equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure + * group. + */ + if (p->is_aarch32) { + switch (p->Op1) { + default: /* Keep GCC quiet */ + case 0: /* ICC_SGI1R */ + g1 = true; + break; + case 1: /* ICC_ASGI1R */ + case 2: /* ICC_SGI0R */ + g1 = false; + break; + } + } else { + switch (p->Op2) { + default: /* Keep GCC quiet */ + case 5: /* ICC_SGI1R_EL1 */ + g1 = true; + break; + case 6: /* ICC_ASGI1R_EL1 */ + case 7: /* ICC_SGI0R_EL1 */ + g1 = false; + break; + } + } + + vgic_v3_dispatch_sgi(vcpu, p->regval, g1); return true; } @@ -1303,6 +1345,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, @@ -1613,8 +1657,6 @@ static const struct sys_reg_desc cp14_64_regs[] = { * register). */ static const struct sys_reg_desc cp15_regs[] = { - { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, - { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, @@ -1737,8 +1779,10 @@ static const struct sys_reg_desc cp15_regs[] = { static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr }, - { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, + { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, + { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval }, }; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 95f9107449bf..9527ba5d62da 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -74,6 +74,7 @@ #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -213,6 +214,8 @@ enum vmcs_field { VMWRITE_BITMAP_HIGH = 0x00002029, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 73e27a98456f..6276140044d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5586,8 +5586,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); - local_irq_enable(); - /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. Since vmentry is serialising on affected CPUs, there @@ -5596,6 +5594,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + local_irq_enable(); + asm volatile ( "push %%" _ASM_BP "; \n\t" "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" @@ -5718,12 +5718,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - reload_tss(vcpu); local_irq_disable(); + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1519f030fd73..8dae47e7267a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,12 +198,14 @@ static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_ static const struct { const char *option; - enum vmx_l1d_flush_state cmd; + bool for_parse; } vmentry_l1d_param[] = { - {"auto", VMENTER_L1D_FLUSH_AUTO}, - {"never", VMENTER_L1D_FLUSH_NEVER}, - {"cond", VMENTER_L1D_FLUSH_COND}, - {"always", VMENTER_L1D_FLUSH_ALWAYS}, + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, + [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, + [VMENTER_L1D_FLUSH_COND] = {"cond", true}, + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, }; #define L1D_CACHE_ORDER 4 @@ -219,15 +221,15 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } - } + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } /* If set to auto use the default l1tf mitigation method */ if (l1tf == VMENTER_L1D_FLUSH_AUTO) { @@ -287,8 +289,9 @@ static int vmentry_l1d_flush_parse(const char *s) if (s) { for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { - if (sysfs_streq(s, vmentry_l1d_param[i].option)) - return vmentry_l1d_param[i].cmd; + if (vmentry_l1d_param[i].for_parse && + sysfs_streq(s, vmentry_l1d_param[i].option)) + return i; } } return -EINVAL; @@ -298,13 +301,13 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) { int l1tf, ret; - if (!boot_cpu_has(X86_BUG_L1TF)) - return 0; - l1tf = vmentry_l1d_flush_parse(s); if (l1tf < 0) return l1tf; + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + /* * Has vmx_init() run already? If not then this is the pre init * parameter parsing. In that case just store the value and let @@ -324,6 +327,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) + return sprintf(s, "???\n"); + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } @@ -1684,6 +1690,12 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_encls_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENCLS_EXITING; +} + /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -4551,7 +4563,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_VMFUNC; + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -6648,6 +6661,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9314,6 +9330,17 @@ fail: return 1; } +static int handle_encls(struct kvm_vcpu *vcpu) +{ + /* + * SGX virtualization is not yet supported. There is no software + * enable bit for SGX, so we have to trap ENCLS and inject a #UD + * to prevent the guest from executing ENCLS. + */ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -9371,6 +9398,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, + [EXIT_REASON_ENCLS] = handle_encls, }; static const int kvm_vmx_max_exit_handlers = @@ -9741,6 +9769,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return false; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return false; default: return true; } @@ -12101,6 +12132,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) vmcs_write64(APIC_ACCESS_ADDR, -1ull); + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a74a7cf0a8b..506bd2b4b8bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6576,14 +6576,12 @@ static void kvm_set_mmio_spte_mask(void) /* Set the present bit. */ mask |= 1ull; -#ifdef CONFIG_X86_64 /* * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (maxphyaddr == 52) + if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) mask &= ~1ull; -#endif kvm_mmu_set_mmio_spte_mask(mask, mask); } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index cfdd2484cc42..4f31f96bbfab 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -133,6 +133,7 @@ struct vgic_irq { u8 source; /* GICv2 SGIs only */ u8 active_source; /* GICv2 SGIs only */ u8 priority; + u8 group; /* 0 == group 0, 1 == group 1 */ enum vgic_irq_config config; /* Level or edge */ /* @@ -217,6 +218,12 @@ struct vgic_dist { /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ u32 vgic_model; + /* Implementation revision as reported in the GICD_IIDR */ + u32 implementation_rev; + + /* Userspace can write to GICv2 IGROUPR */ + bool v2_groups_user_writable; + /* Do injected MSIs require an additional device ID? */ bool msis_require_devid; @@ -366,7 +373,7 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); /** * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 9d2ea3e907d0..8bdbb5f29494 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -61,6 +61,16 @@ #define GICD_CTLR_ENABLE_G1A (1U << 1) #define GICD_CTLR_ENABLE_G1 (1U << 0) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + /* * In systems with a single security state (what we emulate in KVM) * the meaning of the interrupt group enable bits is slightly different diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 68d8b1f73682..6c4aaf04046c 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -71,6 +71,16 @@ (GICD_INT_DEF_PRI << 8) |\ GICD_INT_DEF_PRI) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + #define GICH_HCR 0x0 #define GICH_VTR 0x4 #define GICH_VMCR 0x8 @@ -94,6 +104,7 @@ #define GICH_LR_PENDING_BIT (1 << 28) #define GICH_LR_ACTIVE_BIT (1 << 29) #define GICH_LR_EOI (1 << 19) +#define GICH_LR_GROUP1 (1 << 30) #define GICH_LR_HW (1 << 31) #define GICH_VMCR_ENABLE_GRP0_SHIFT 0 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3cf632839337..07548de5c988 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -951,6 +951,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_TLBFLUSH 155 #define KVM_CAP_S390_HPAGE_1M 156 #define KVM_CAP_NESTED_STATE 157 +#define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 63440cc8d618..e63662db131b 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -97,6 +97,23 @@ static inline int test_and_set_bit(int nr, unsigned long *addr) } /** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + */ +static inline int test_and_clear_bit(int nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old; + + old = *p; + *p = old & ~mask; + + return (old & mask) != 0; +} + +/** * bitmap_alloc - Allocate bitmap * @nbits: Number of bits */ diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index dd0e5163f01f..03b0f551bedf 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -11,13 +11,16 @@ TEST_GEN_PROGS_x86_64 += sync_regs_test TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += state_test +TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ -CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I.. +LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include +CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I.. +LDFLAGS += -lpthread # After inclusion, $(OUTPUT) is defined and # $(TEST_GEN_PROGS) starts with $(OUTPUT)/ diff --git a/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c index 8346b33c2073..11ec358bf969 100644 --- a/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/cr4_cpuid_sync_test.c @@ -23,20 +23,6 @@ #define X86_FEATURE_OSXSAVE (1<<27) #define VCPU_ID 1 -enum { - GUEST_UPDATE_CR4 = 0x1000, - GUEST_FAILED, - GUEST_DONE, -}; - -static void exit_to_hv(uint16_t port) -{ - __asm__ __volatile__("in %[port], %%al" - : - : [port]"d"(port) - : "rax"); -} - static inline bool cr4_cpuid_is_sync(void) { int func, subfunc; @@ -64,17 +50,15 @@ static void guest_code(void) set_cr4(cr4); /* verify CR4.OSXSAVE == CPUID.OSXSAVE */ - if (!cr4_cpuid_is_sync()) - exit_to_hv(GUEST_FAILED); + GUEST_ASSERT(cr4_cpuid_is_sync()); /* notify hypervisor to change CR4 */ - exit_to_hv(GUEST_UPDATE_CR4); + GUEST_SYNC(0); /* check again */ - if (!cr4_cpuid_is_sync()) - exit_to_hv(GUEST_FAILED); + GUEST_ASSERT(cr4_cpuid_is_sync()); - exit_to_hv(GUEST_DONE); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -95,7 +79,7 @@ int main(int argc, char *argv[]) setbuf(stdout, NULL); /* Create VM */ - vm = vm_create_default(VCPU_ID, guest_code); + vm = vm_create_default(VCPU_ID, 0, guest_code); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); run = vcpu_state(vm, VCPU_ID); @@ -104,16 +88,16 @@ int main(int argc, char *argv[]) if (run->exit_reason == KVM_EXIT_IO) { switch (run->io.port) { - case GUEST_UPDATE_CR4: + case GUEST_PORT_SYNC: /* emulate hypervisor clearing CR4.OSXSAVE */ vcpu_sregs_get(vm, VCPU_ID, &sregs); sregs.cr4 &= ~X86_CR4_OSXSAVE; vcpu_sregs_set(vm, VCPU_ID, &sregs); break; - case GUEST_FAILED: + case GUEST_PORT_ABORT: TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); break; - case GUEST_DONE: + case GUEST_PORT_DONE: goto done; default: TEST_ASSERT(false, "Unknown port 0x%x.", diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c new file mode 100644 index 000000000000..0c2cdc105f96 --- /dev/null +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <pthread.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include "test_util.h" +#include "kvm_util.h" + +#define DEBUG printf + +#define VCPU_ID 1 +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +/* + * GPA offset of the testing memory slot. Must be bigger than the + * default vm mem slot, which is DEFAULT_GUEST_PHY_PAGES. + */ +#define TEST_MEM_OFFSET (1ULL << 30) /* 1G */ +/* Size of the testing memory slot */ +#define TEST_MEM_PAGES (1ULL << 18) /* 1G for 4K pages */ +/* How many pages to dirty for each guest loop */ +#define TEST_PAGES_PER_LOOP 1024 +/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */ +#define TEST_HOST_LOOP_N 32 +/* Interval for each host loop (ms) */ +#define TEST_HOST_LOOP_INTERVAL 10 + +/* + * Guest variables. We use these variables to share data between host + * and guest. There are two copies of the variables, one in host memory + * (which is unused) and one in guest memory. When the host wants to + * access these variables, it needs to call addr_gva2hva() to access the + * guest copy. + */ +uint64_t guest_random_array[TEST_PAGES_PER_LOOP]; +uint64_t guest_iteration; +uint64_t guest_page_size; + +/* + * Writes to the first byte of a random page within the testing memory + * region continuously. + */ +void guest_code(void) +{ + int i = 0; + uint64_t volatile *array = guest_random_array; + uint64_t volatile *guest_addr; + + while (true) { + for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { + /* + * Write to the first 8 bytes of a random page + * on the testing memory region. + */ + guest_addr = (uint64_t *) + (TEST_MEM_OFFSET + + (array[i] % TEST_MEM_PAGES) * guest_page_size); + *guest_addr = guest_iteration; + } + /* Tell the host that we need more random numbers */ + GUEST_SYNC(1); + } +} + +/* + * Host variables. These variables should only be used by the host + * rather than the guest. + */ +bool host_quit; + +/* Points to the test VM memory region on which we track dirty logs */ +void *host_test_mem; + +/* For statistics only */ +uint64_t host_dirty_count; +uint64_t host_clear_count; +uint64_t host_track_next_count; + +/* + * We use this bitmap to track some pages that should have its dirty + * bit set in the _next_ iteration. For example, if we detected the + * page value changed to current iteration but at the same time the + * page bit is cleared in the latest bitmap, then the system must + * report that write in the next get dirty log call. + */ +unsigned long *host_bmap_track; + +void generate_random_array(uint64_t *guest_array, uint64_t size) +{ + uint64_t i; + + for (i = 0; i < size; i++) { + guest_array[i] = random(); + } +} + +void *vcpu_worker(void *data) +{ + int ret; + uint64_t loops, *guest_array, pages_count = 0; + struct kvm_vm *vm = data; + struct kvm_run *run; + struct guest_args args; + + run = vcpu_state(vm, VCPU_ID); + + /* Retrieve the guest random array pointer and cache it */ + guest_array = addr_gva2hva(vm, (vm_vaddr_t)guest_random_array); + + DEBUG("VCPU starts\n"); + + generate_random_array(guest_array, TEST_PAGES_PER_LOOP); + + while (!READ_ONCE(host_quit)) { + /* Let the guest to dirty these random pages */ + ret = _vcpu_run(vm, VCPU_ID); + guest_args_read(vm, VCPU_ID, &args); + if (run->exit_reason == KVM_EXIT_IO && + args.port == GUEST_PORT_SYNC) { + pages_count += TEST_PAGES_PER_LOOP; + generate_random_array(guest_array, TEST_PAGES_PER_LOOP); + } else { + TEST_ASSERT(false, + "Invalid guest sync status: " + "exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + } + } + + DEBUG("VCPU exits, dirtied %"PRIu64" pages\n", pages_count); + + return NULL; +} + +void vm_dirty_log_verify(unsigned long *bmap, uint64_t iteration) +{ + uint64_t page; + uint64_t volatile *value_ptr; + + for (page = 0; page < TEST_MEM_PAGES; page++) { + value_ptr = host_test_mem + page * getpagesize(); + + /* If this is a special page that we were tracking... */ + if (test_and_clear_bit(page, host_bmap_track)) { + host_track_next_count++; + TEST_ASSERT(test_bit(page, bmap), + "Page %"PRIu64" should have its dirty bit " + "set in this iteration but it is missing", + page); + } + + if (test_bit(page, bmap)) { + host_dirty_count++; + /* + * If the bit is set, the value written onto + * the corresponding page should be either the + * previous iteration number or the current one. + */ + TEST_ASSERT(*value_ptr == iteration || + *value_ptr == iteration - 1, + "Set page %"PRIu64" value %"PRIu64 + " incorrect (iteration=%"PRIu64")", + page, *value_ptr, iteration); + } else { + host_clear_count++; + /* + * If cleared, the value written can be any + * value smaller or equals to the iteration + * number. Note that the value can be exactly + * (iteration-1) if that write can happen + * like this: + * + * (1) increase loop count to "iteration-1" + * (2) write to page P happens (with value + * "iteration-1") + * (3) get dirty log for "iteration-1"; we'll + * see that page P bit is set (dirtied), + * and not set the bit in host_bmap_track + * (4) increase loop count to "iteration" + * (which is current iteration) + * (5) get dirty log for current iteration, + * we'll see that page P is cleared, with + * value "iteration-1". + */ + TEST_ASSERT(*value_ptr <= iteration, + "Clear page %"PRIu64" value %"PRIu64 + " incorrect (iteration=%"PRIu64")", + page, *value_ptr, iteration); + if (*value_ptr == iteration) { + /* + * This page is _just_ modified; it + * should report its dirtyness in the + * next run + */ + set_bit(page, host_bmap_track); + } + } + } +} + +void help(char *name) +{ + puts(""); + printf("usage: %s [-i iterations] [-I interval] [-h]\n", name); + puts(""); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", + TEST_HOST_LOOP_INTERVAL); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + pthread_t vcpu_thread; + struct kvm_vm *vm; + uint64_t volatile *psize, *iteration; + unsigned long *bmap, iterations = TEST_HOST_LOOP_N, + interval = TEST_HOST_LOOP_INTERVAL; + int opt; + + while ((opt = getopt(argc, argv, "hi:I:")) != -1) { + switch (opt) { + case 'i': + iterations = strtol(optarg, NULL, 10); + break; + case 'I': + interval = strtol(optarg, NULL, 10); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_ASSERT(iterations > 2, "Iteration must be bigger than zero\n"); + TEST_ASSERT(interval > 0, "Interval must be bigger than zero"); + + DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", + iterations, interval); + + srandom(time(0)); + + bmap = bitmap_alloc(TEST_MEM_PAGES); + host_bmap_track = bitmap_alloc(TEST_MEM_PAGES); + + vm = vm_create_default(VCPU_ID, TEST_MEM_PAGES, guest_code); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + TEST_MEM_OFFSET, + TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, + KVM_MEM_LOG_DIRTY_PAGES); + /* Cache the HVA pointer of the region */ + host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)TEST_MEM_OFFSET); + + /* Do 1:1 mapping for the dirty track memory slot */ + virt_map(vm, TEST_MEM_OFFSET, TEST_MEM_OFFSET, + TEST_MEM_PAGES * getpagesize(), 0); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* Tell the guest about the page size on the system */ + psize = addr_gva2hva(vm, (vm_vaddr_t)&guest_page_size); + *psize = getpagesize(); + + /* Start the iterations */ + iteration = addr_gva2hva(vm, (vm_vaddr_t)&guest_iteration); + *iteration = 1; + + /* Start dirtying pages */ + pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); + + while (*iteration < iterations) { + /* Give the vcpu thread some time to dirty some pages */ + usleep(interval * 1000); + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + vm_dirty_log_verify(bmap, *iteration); + (*iteration)++; + } + + /* Tell the vcpu thread to quit */ + host_quit = true; + pthread_join(vcpu_thread, NULL); + + DEBUG("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " + "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count, + host_track_next_count); + + free(bmap); + free(host_bmap_track); + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d32632f71ab8..bb5a25fb82c6 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -55,6 +55,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_release(struct kvm_vm *vmp); +void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); @@ -80,6 +81,8 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint32_t data_memslot, uint32_t pgd_memslot); +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + size_t size, uint32_t pgd_memslot); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); @@ -127,7 +130,8 @@ kvm_get_supported_cpuid_entry(uint32_t function) return kvm_get_supported_cpuid_index(function, 0); } -struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code); +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size, + void *guest_code); void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); typedef void (*vmx_guest_code_t)(vm_vaddr_t vmxon_vaddr, @@ -144,4 +148,43 @@ allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region); int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); +#define GUEST_PORT_SYNC 0x1000 +#define GUEST_PORT_ABORT 0x1001 +#define GUEST_PORT_DONE 0x1002 + +static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) +{ + __asm__ __volatile__("in %[port], %%al" + : + : [port]"d"(port), "D"(arg0), "S"(arg1) + : "rax"); +} + +/* + * Allows to pass three arguments to the host: port is 16bit wide, + * arg0 & arg1 are 64bit wide + */ +#define GUEST_SYNC_ARGS(_port, _arg0, _arg1) \ + __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1)) + +#define GUEST_ASSERT(_condition) do { \ + if (!(_condition)) \ + GUEST_SYNC_ARGS(GUEST_PORT_ABORT, \ + "Failed guest assert: " \ + #_condition, __LINE__); \ + } while (0) + +#define GUEST_SYNC(stage) GUEST_SYNC_ARGS(GUEST_PORT_SYNC, "hello", stage) + +#define GUEST_DONE() GUEST_SYNC_ARGS(GUEST_PORT_DONE, 0, 0) + +struct guest_args { + uint64_t arg0; + uint64_t arg1; + uint16_t port; +} __attribute__ ((packed)); + +void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id, + struct guest_args *args); + #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index ac53730b30aa..73c3933436ec 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -28,8 +28,6 @@ int test_seq_read(const char *path, char **bufp, size_t *sizep); void test_assert(bool exp, const char *exp_str, const char *file, unsigned int line, const char *fmt, ...); -#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) - #define TEST_ASSERT(e, fmt, ...) \ test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 643309d6de74..e9ba389c48db 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -14,6 +14,7 @@ #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> +#include <linux/kernel.h> #define KVM_DEV_PATH "/dev/kvm" @@ -168,6 +169,16 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) } } +void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + int ret; + + ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); + TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", + strerror(-ret)); +} + /* Userspace Memory Region Find * * Input Args: @@ -923,6 +934,39 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, return vaddr_start; } +/* + * Map a range of VM virtual address to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * vaddr - Virtuall address to map + * paddr - VM Physical Address + * size - The size of the range to map + * pgd_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a virtual translation for the + * page range starting at vaddr to the page range starting at paddr. + */ +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + size_t size, uint32_t pgd_memslot) +{ + size_t page_size = vm->page_size; + size_t npages = size / page_size; + + TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + virt_pg_map(vm, vaddr, paddr, pgd_memslot); + vaddr += page_size; + paddr += page_size; + } +} + /* Address VM Physical to Host Virtual * * Input Args: @@ -1536,3 +1580,17 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) { return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } + +void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id, + struct guest_args *args) +{ + struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_regs regs; + + memset(®s, 0, sizeof(regs)); + vcpu_regs_get(vm, vcpu_id, ®s); + + args->port = run->io.port; + args->arg0 = regs.rdi; + args->arg1 = regs.rsi; +} diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c index e38345252df5..a3122f1949a8 100644 --- a/tools/testing/selftests/kvm/lib/x86.c +++ b/tools/testing/selftests/kvm/lib/x86.c @@ -702,6 +702,9 @@ void vcpu_set_cpuid(struct kvm_vm *vm, * * Input Args: * vcpuid - The id of the single VCPU to add to the VM. + * extra_mem_pages - The size of extra memories to add (this will + * decide how much extra space we will need to + * setup the page tables using mem slot 0) * guest_code - The vCPU's entry point * * Output Args: None @@ -709,12 +712,23 @@ void vcpu_set_cpuid(struct kvm_vm *vm, * Return: * Pointer to opaque structure that describes the created VM. */ -struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code) +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) { struct kvm_vm *vm; + /* + * For x86 the maximum page table size for a memory region + * will be when only 4K pages are used. In that case the + * total extra size for page tables (for extra N pages) will + * be: N/512+N/512^2+N/512^3+... which is definitely smaller + * than N/512*2. + */ + uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; /* Create VM */ - vm = vm_create(VM_MODE_FLAT48PG, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_FLAT48PG, + DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, + O_RDWR); /* Setup guest code */ kvm_vm_elf_load(vm, program_invocation_name, 0, 0); diff --git a/tools/testing/selftests/kvm/set_sregs_test.c b/tools/testing/selftests/kvm/set_sregs_test.c index 090fd3f19352..881419d5746e 100644 --- a/tools/testing/selftests/kvm/set_sregs_test.c +++ b/tools/testing/selftests/kvm/set_sregs_test.c @@ -36,7 +36,7 @@ int main(int argc, char *argv[]) setbuf(stdout, NULL); /* Create VM */ - vm = vm_create_default(VCPU_ID, NULL); + vm = vm_create_default(VCPU_ID, 0, NULL); vcpu_sregs_get(vm, VCPU_ID, &sregs); sregs.apic_base = 1 << 10; diff --git a/tools/testing/selftests/kvm/state_test.c b/tools/testing/selftests/kvm/state_test.c index ecabf25b7077..900e3e9dfb9f 100644 --- a/tools/testing/selftests/kvm/state_test.c +++ b/tools/testing/selftests/kvm/state_test.c @@ -21,28 +21,6 @@ #include "vmx.h" #define VCPU_ID 5 -#define PORT_SYNC 0x1000 -#define PORT_ABORT 0x1001 -#define PORT_DONE 0x1002 - -static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) -{ - __asm__ __volatile__("in %[port], %%al" - : - : [port]"d"(port), "D"(arg0), "S"(arg1) - : "rax"); -} - -#define exit_to_l0(_port, _arg0, _arg1) \ - __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1)) - -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, __LINE__);\ -} while (0) - -#define GUEST_SYNC(stage) \ - exit_to_l0(PORT_SYNC, "hello", stage); static bool have_nested_state; @@ -137,7 +115,7 @@ void guest_code(struct vmx_pages *vmx_pages) if (vmx_pages) l1_guest_code(vmx_pages); - exit_to_l0(PORT_DONE, 0, 0); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -154,7 +132,7 @@ int main(int argc, char *argv[]) struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); /* Create VM */ - vm = vm_create_default(VCPU_ID, guest_code); + vm = vm_create_default(VCPU_ID, 0, guest_code); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); run = vcpu_state(vm, VCPU_ID); @@ -178,13 +156,13 @@ int main(int argc, char *argv[]) memset(®s1, 0, sizeof(regs1)); vcpu_regs_get(vm, VCPU_ID, ®s1); switch (run->io.port) { - case PORT_ABORT: + case GUEST_PORT_ABORT: TEST_ASSERT(false, "%s at %s:%d", (const char *) regs1.rdi, __FILE__, regs1.rsi); /* NOT REACHED */ - case PORT_SYNC: + case GUEST_PORT_SYNC: break; - case PORT_DONE: + case GUEST_PORT_DONE: goto done; default: TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port); diff --git a/tools/testing/selftests/kvm/sync_regs_test.c b/tools/testing/selftests/kvm/sync_regs_test.c index eae1ece3c31b..213343e5dff9 100644 --- a/tools/testing/selftests/kvm/sync_regs_test.c +++ b/tools/testing/selftests/kvm/sync_regs_test.c @@ -22,28 +22,11 @@ #include "x86.h" #define VCPU_ID 5 -#define PORT_HOST_SYNC 0x1000 - -static void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) -{ - __asm__ __volatile__("in %[port], %%al" - : - : [port]"d"(port), "D"(arg0), "S"(arg1) - : "rax"); -} - -#define exit_to_l0(_port, _arg0, _arg1) \ - __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1)) - -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, 0);\ -} while (0) void guest_code(void) { for (;;) { - exit_to_l0(PORT_HOST_SYNC, "hello", 0); + GUEST_SYNC(0); asm volatile ("inc %r11"); } } @@ -111,7 +94,7 @@ int main(int argc, char *argv[]) } /* Create VM */ - vm = vm_create_default(VCPU_ID, guest_code); + vm = vm_create_default(VCPU_ID, 0, guest_code); run = vcpu_state(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c index fc414c284368..49bcc68b0235 100644 --- a/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/vmx_tsc_adjust_test.c @@ -62,27 +62,12 @@ struct kvm_single_msr { /* The virtual machine object. */ static struct kvm_vm *vm; -#define exit_to_l0(_port, _arg) do_exit_to_l0(_port, (unsigned long) (_arg)) -static void do_exit_to_l0(uint16_t port, unsigned long arg) -{ - __asm__ __volatile__("in %[port], %%al" - : - : [port]"d"(port), "D"(arg) - : "rax"); -} - - -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition); \ -} while (0) - static void check_ia32_tsc_adjust(int64_t max) { int64_t adjust; adjust = rdmsr(MSR_IA32_TSC_ADJUST); - exit_to_l0(PORT_REPORT, adjust); + GUEST_SYNC(adjust); GUEST_ASSERT(adjust <= max); } @@ -132,7 +117,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - exit_to_l0(PORT_DONE, 0); + GUEST_DONE(); } void report(int64_t val) @@ -152,7 +137,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, (void *) l1_guest_code); + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); /* Allocate VMX pages and shared descriptors (vmx_pages). */ @@ -161,26 +146,26 @@ int main(int argc, char *argv[]) for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); - struct kvm_regs regs; + struct guest_args args; vcpu_run(vm, VCPU_ID); - vcpu_regs_get(vm, VCPU_ID, ®s); + guest_args_read(vm, VCPU_ID, &args); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Got exit_reason other than KVM_EXIT_IO: %u (%s), rip=%lx\n", + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, - exit_reason_str(run->exit_reason), regs.rip); + exit_reason_str(run->exit_reason)); - switch (run->io.port) { - case PORT_ABORT: - TEST_ASSERT(false, "%s", (const char *) regs.rdi); + switch (args.port) { + case GUEST_PORT_ABORT: + TEST_ASSERT(false, "%s", (const char *) args.arg0); /* NOT REACHED */ - case PORT_REPORT: - report(regs.rdi); + case GUEST_PORT_SYNC: + report(args.arg1); break; - case PORT_DONE: + case GUEST_PORT_DONE: goto done; default: - TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port); + TEST_ASSERT(false, "Unknown port 0x%x.", args.port); } } diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index bd3d57f40f1b..17cecc96f735 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -295,9 +295,9 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* - * If the timer can fire now we have just raised the IRQ line and we - * don't need to have a soft timer scheduled for the future. If the - * timer cannot fire at all, then we also don't need a soft timer. + * If the timer can fire now, we don't need to have a soft timer + * scheduled for the future. If the timer cannot fire at all, + * then we also don't need a soft timer. */ if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { soft_timer_cancel(&timer->phys_timer, NULL); @@ -332,10 +332,10 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) level = kvm_timer_should_fire(vtimer); kvm_timer_update_irq(vcpu, level, vtimer); + phys_timer_emulate(vcpu); + if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); - - phys_timer_emulate(vcpu); } static void vtimer_save_state(struct kvm_vcpu *vcpu) @@ -487,6 +487,7 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); if (unlikely(!timer->enabled)) return; @@ -502,6 +503,10 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) /* Set the background timer for the physical timer emulation. */ phys_timer_emulate(vcpu); + + /* If the timer fired while we weren't running, inject it now */ + if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) + kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 108250e4d376..c92053bc3f96 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -30,6 +30,7 @@ #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> +#include <linux/sched/stat.h> #include <trace/events/kvm.h> #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> @@ -380,6 +381,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_timer_vcpu_load(vcpu); kvm_vcpu_load_sysregs(vcpu); kvm_arch_vcpu_load_fp(vcpu); + + if (single_task_running()) + vcpu_clear_wfe_traps(vcpu); + else + vcpu_set_wfe_traps(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -1044,6 +1050,32 @@ static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, return ret; } +static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + memset(events, 0, sizeof(*events)); + + return __kvm_arm_vcpu_get_events(vcpu, events); +} + +static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + int i; + + /* check whether the reserved field is zero */ + for (i = 0; i < ARRAY_SIZE(events->reserved); i++) + if (events->reserved[i]) + return -EINVAL; + + /* check whether the pad field is zero */ + for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) + if (events->exception.pad[i]) + return -EINVAL; + + return __kvm_arm_vcpu_set_events(vcpu, events); +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1124,6 +1156,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_arm_vcpu_has_attr(vcpu, &attr); break; } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (kvm_arm_vcpu_get_events(vcpu, &events)) + return -EINVAL; + + if (copy_to_user(argp, &events, sizeof(events))) + return -EFAULT; + + return 0; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + if (copy_from_user(&events, argp, sizeof(events))) + return -EFAULT; + + return kvm_arm_vcpu_set_events(vcpu, &events); + } default: r = -EINVAL; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 1d90d79706bd..91aaf73b00df 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -177,6 +177,35 @@ static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr put_page(virt_to_page(pmd)); } +static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) +{ + WRITE_ONCE(*ptep, new_pte); + dsb(ishst); +} + +static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) +{ + WRITE_ONCE(*pmdp, new_pmd); + dsb(ishst); +} + +static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) +{ + kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); +} + +static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) +{ + WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); + dsb(ishst); +} + +static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) +{ + WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); + dsb(ishst); +} + /* * Unmapping vs dcache management: * @@ -196,6 +225,10 @@ static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr * This is why right after unmapping a page/section and invalidating * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure * the IO subsystem will never hit in the cache. + * + * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as + * we then fully enforce cacheability of RAM, no matter what the guest + * does. */ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr, phys_addr_t end) @@ -576,7 +609,6 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, pte = pte_offset_kernel(pmd, addr); kvm_set_pte(pte, pfn_pte(pfn, prot)); get_page(virt_to_page(pte)); - kvm_flush_dcache_to_poc(pte, sizeof(*pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); } @@ -601,9 +633,8 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, kvm_err("Cannot allocate Hyp pte\n"); return -ENOMEM; } - pmd_populate_kernel(NULL, pmd, pte); + kvm_pmd_populate(pmd, pte); get_page(virt_to_page(pmd)); - kvm_flush_dcache_to_poc(pmd, sizeof(*pmd)); } next = pmd_addr_end(addr, end); @@ -634,9 +665,8 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, kvm_err("Cannot allocate Hyp pmd\n"); return -ENOMEM; } - pud_populate(NULL, pud, pmd); + kvm_pud_populate(pud, pmd); get_page(virt_to_page(pud)); - kvm_flush_dcache_to_poc(pud, sizeof(*pud)); } next = pud_addr_end(addr, end); @@ -671,9 +701,8 @@ static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, err = -ENOMEM; goto out; } - pgd_populate(NULL, pgd, pud); + kvm_pgd_populate(pgd, pud); get_page(virt_to_page(pgd)); - kvm_flush_dcache_to_poc(pgd, sizeof(*pgd)); } next = pgd_addr_end(addr, end); @@ -1015,19 +1044,35 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache pmd = stage2_get_pmd(kvm, cache, addr); VM_BUG_ON(!pmd); - /* - * Mapping in huge pages should only happen through a fault. If a - * page is merged into a transparent huge page, the individual - * subpages of that huge page should be unmapped through MMU - * notifiers before we get here. - * - * Merging of CompoundPages is not supported; they should become - * splitting first, unmapped, merged, and mapped back in on-demand. - */ - VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); - old_pmd = *pmd; if (pmd_present(old_pmd)) { + /* + * Multiple vcpus faulting on the same PMD entry, can + * lead to them sequentially updating the PMD with the + * same value. Following the break-before-make + * (pmd_clear() followed by tlb_flush()) process can + * hinder forward progress due to refaults generated + * on missing translations. + * + * Skip updating the page table if the entry is + * unchanged. + */ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + + /* + * Mapping in huge pages should only happen through a + * fault. If a page is merged into a transparent huge + * page, the individual subpages of that huge page + * should be unmapped through MMU notifiers before we + * get here. + * + * Merging of CompoundPages is not supported; they + * should become splitting first, unmapped, merged, + * and mapped back in on-demand. + */ + VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); + pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { @@ -1090,7 +1135,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ pte = mmu_memory_cache_alloc(cache); - pmd_populate_kernel(NULL, pmd, pte); + kvm_pmd_populate(pmd, pte); get_page(virt_to_page(pmd)); } @@ -1102,6 +1147,10 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, /* Create 2nd stage page table mapping - Level 3 */ old_pte = *pte; if (pte_present(old_pte)) { + /* Skip page table update if there is no change */ + if (pte_val(old_pte) == pte_val(*new_pte)) + return 0; + kvm_set_pte(pte, __pte(0)); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c index c589d4c2b478..07aa900bac56 100644 --- a/virt/kvm/arm/vgic/vgic-debug.c +++ b/virt/kvm/arm/vgic/vgic-debug.c @@ -36,9 +36,12 @@ struct vgic_state_iter { int nr_cpus; int nr_spis; + int nr_lpis; int dist_id; int vcpu_id; int intid; + int lpi_idx; + u32 *lpi_array; }; static void iter_next(struct vgic_state_iter *iter) @@ -52,6 +55,12 @@ static void iter_next(struct vgic_state_iter *iter) if (iter->intid == VGIC_NR_PRIVATE_IRQS && ++iter->vcpu_id < iter->nr_cpus) iter->intid = 0; + + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) { + if (iter->lpi_idx < iter->nr_lpis) + iter->intid = iter->lpi_array[iter->lpi_idx]; + iter->lpi_idx++; + } } static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, @@ -63,6 +72,11 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array); + if (iter->nr_lpis < 0) + iter->nr_lpis = 0; + } /* Fast forward to the right position if needed */ while (pos--) @@ -73,7 +87,8 @@ static bool end_of_vgic(struct vgic_state_iter *iter) { return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && - (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis; + iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && + iter->lpi_idx > iter->nr_lpis; } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) @@ -130,6 +145,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v) mutex_lock(&kvm->lock); iter = kvm->arch.vgic.iter; + kfree(iter->lpi_array); kfree(iter); kvm->arch.vgic.iter = NULL; mutex_unlock(&kvm->lock); @@ -137,17 +153,20 @@ static void vgic_debug_stop(struct seq_file *s, void *v) static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) { + bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; + seq_printf(s, "Distributor\n"); seq_printf(s, "===========\n"); - seq_printf(s, "vgic_model:\t%s\n", - (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) ? - "GICv3" : "GICv2"); + seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); + if (v3) + seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); seq_printf(s, "P=pending_latch, L=line_level, A=active\n"); seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n"); + seq_printf(s, "G=group\n"); } static void print_header(struct seq_file *s, struct vgic_irq *irq, @@ -162,8 +181,8 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq, } seq_printf(s, "\n"); - seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHC HWID TARGET SRC PRI VCPU_ID\n", hdr, id); - seq_printf(s, "---------------------------------------------------------------\n"); + seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id); + seq_printf(s, "----------------------------------------------------------------\n"); } static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, @@ -174,15 +193,17 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, type = "SGI"; else if (irq->intid < VGIC_NR_PRIVATE_IRQS) type = "PPI"; - else + else if (irq->intid < VGIC_MAX_SPI) type = "SPI"; + else + type = "LPI"; if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS) print_header(s, irq, vcpu); seq_printf(s, " %s %4d " " %2d " - "%d%d%d%d%d%d " + "%d%d%d%d%d%d%d " "%8d " "%8x " " %2x " @@ -197,12 +218,12 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq, irq->enabled, irq->hw, irq->config == VGIC_CONFIG_LEVEL, + irq->group, irq->hwintid, irq->mpidr, irq->source, irq->priority, (irq->vcpu) ? irq->vcpu->vcpu_id : -1); - } static int vgic_debug_show(struct seq_file *s, void *v) @@ -221,17 +242,20 @@ static int vgic_debug_show(struct seq_file *s, void *v) if (!kvm->arch.vgic.initialized) return 0; - if (iter->vcpu_id < iter->nr_cpus) { + if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - irq = &vcpu->arch.vgic_cpu.private_irqs[iter->intid]; - } else { - irq = &kvm->arch.vgic.spis[iter->intid - VGIC_NR_PRIVATE_IRQS]; + + irq = vgic_get_irq(kvm, vcpu, iter->intid); + if (!irq) { + seq_printf(s, " LPI %4d freed\n", iter->intid); + return 0; } spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); spin_unlock_irqrestore(&irq->irq_lock, flags); + vgic_put_irq(kvm, irq); return 0; } diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 2673efce65f3..c0c0b88af1d5 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -175,10 +175,13 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) irq->vcpu = NULL; irq->target_vcpu = vcpu0; kref_init(&irq->refcount); - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { irq->targets = 0; - else + irq->group = 0; + } else { irq->mpidr = 0; + irq->group = 1; + } } return 0; } @@ -227,6 +230,18 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } + + /* + * GICv3 can only be created via the KVM_DEVICE_CREATE API and + * so we always know the emulation type at this point as it's + * either explicitly configured as GICv3, or explicitly + * configured as GICv2, or not configured yet which also + * implies GICv2. + */ + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + irq->group = 1; + else + irq->group = 0; } if (!irqchip_in_kernel(vcpu->kvm)) @@ -271,6 +286,10 @@ int vgic_init(struct kvm *kvm) if (vgic_initialized(kvm)) return 0; + /* Are we also in the middle of creating a VCPU? */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + /* freeze the number of spis */ if (!dist->nr_spis) dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; @@ -294,6 +313,7 @@ int vgic_init(struct kvm *kvm) vgic_debug_init(kvm); + dist->implementation_rev = 2; dist->initialized = true; out: diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4ed79c939fb4..12502251727e 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -71,6 +71,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, kref_init(&irq->refcount); irq->intid = intid; irq->target_vcpu = vcpu; + irq->group = 1; spin_lock_irqsave(&dist->lpi_list_lock, flags); @@ -168,8 +169,14 @@ struct vgic_its_abi { int (*commit)(struct vgic_its *its); }; +#define ABI_0_ESZ 8 +#define ESZ_MAX ABI_0_ESZ + static const struct vgic_its_abi its_table_abi_versions[] = { - [0] = {.cte_esz = 8, .dte_esz = 8, .ite_esz = 8, + [0] = { + .cte_esz = ABI_0_ESZ, + .dte_esz = ABI_0_ESZ, + .ite_esz = ABI_0_ESZ, .save_tables = vgic_its_save_tables_v0, .restore_tables = vgic_its_restore_tables_v0, .commit = vgic_its_commit_v0, @@ -183,7 +190,7 @@ inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) return &its_table_abi_versions[its->abi_rev]; } -int vgic_its_set_abi(struct vgic_its *its, int rev) +static int vgic_its_set_abi(struct vgic_its *its, u32 rev) { const struct vgic_its_abi *abi; @@ -312,9 +319,9 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, * enumerate those LPIs without holding any lock. * Returns their number and puts the kmalloc'ed array into intid_ptr. */ -static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr) +int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) { - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; unsigned long flags; u32 *intids; @@ -337,7 +344,7 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr) if (i == irq_count) break; /* We don't need to "get" the IRQ, as we hold the list lock. */ - if (irq->target_vcpu != vcpu) + if (vcpu && irq->target_vcpu != vcpu) continue; intids[i++] = irq->intid; } @@ -429,7 +436,7 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) unsigned long flags; u8 pendmask; - nr_irqs = vgic_copy_lpi_list(vcpu, &intids); + nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids); if (nr_irqs < 0) return nr_irqs; @@ -1154,7 +1161,7 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, vcpu = kvm_get_vcpu(kvm, collection->target_addr); - irq_count = vgic_copy_lpi_list(vcpu, &intids); + irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids); if (irq_count < 0) return irq_count; @@ -1202,7 +1209,7 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, vcpu1 = kvm_get_vcpu(kvm, target1_addr); vcpu2 = kvm_get_vcpu(kvm, target2_addr); - irq_count = vgic_copy_lpi_list(vcpu1, &intids); + irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids); if (irq_count < 0) return irq_count; @@ -1881,14 +1888,14 @@ typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry, * Return: < 0 on error, 0 if last element was identified, 1 otherwise * (the last element may not be found on second level tables) */ -static int scan_its_table(struct vgic_its *its, gpa_t base, int size, int esz, +static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, int start_id, entry_fn_t fn, void *opaque) { struct kvm *kvm = its->dev->kvm; unsigned long len = size; int id = start_id; gpa_t gpa = base; - char entry[esz]; + char entry[ESZ_MAX]; int ret; memset(entry, 0, esz); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index ffc587bf4742..738b65d2d0e7 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -22,22 +22,33 @@ #include "vgic.h" #include "vgic-mmio.h" +/* + * The Revision field in the IIDR have the following meanings: + * + * Revision 1: Report GICv2 interrupts as group 0 instead of group 1 + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. + */ + static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; u32 value; switch (addr & 0x0c) { case GIC_DIST_CTRL: - value = vcpu->kvm->arch.vgic.enabled ? GICD_ENABLE : 0; + value = vgic->enabled ? GICD_ENABLE : 0; break; case GIC_DIST_CTR: - value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; value = (value >> 5) - 1; value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; break; case GIC_DIST_IIDR: - value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); break; default: return 0; @@ -66,6 +77,42 @@ static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, } } +static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + switch (addr & 0x0c) { + case GIC_DIST_IIDR: + if (val != vgic_mmio_read_v2_misc(vcpu, addr, len)) + return -EINVAL; + + /* + * If we observe a write to GICD_IIDR we know that userspace + * has been updated and has had a chance to cope with older + * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting + * interrupts as group 1, and therefore we now allow groups to + * be user writable. Doing this by default would break + * migration from old kernels to new kernels with legacy + * userspace. + */ + vcpu->kvm->arch.vgic.v2_groups_user_writable = true; + return 0; + } + + vgic_mmio_write_v2_misc(vcpu, addr, len, val); + return 0; +} + +static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (vcpu->kvm->arch.vgic.v2_groups_user_writable) + vgic_mmio_write_group(vcpu, addr, len, val); + + return 0; +} + static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, gpa_t addr, unsigned int len, unsigned long val) @@ -352,17 +399,22 @@ static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu, if (n > vgic_v3_max_apr_idx(vcpu)) return; + + n = array_index_nospec(n, 4); + /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */ vgicv3->vgic_ap1r[n] = val; } } static const struct vgic_register_region vgic_v2_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL, - vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12, - VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL, + vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, + NULL, vgic_mmio_uaccess_write_v2_misc, + 12, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, - vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1, + vgic_mmio_read_group, vgic_mmio_write_group, + NULL, vgic_mmio_uaccess_write_v2_group, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 287784095b5b..a2a175b08b17 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -59,19 +59,27 @@ bool vgic_supports_direct_msis(struct kvm *kvm) return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); } +/* + * The Revision field in the IIDR have the following meanings: + * + * Revision 2: Interrupt groups are guest-configurable and signaled using + * their configured groups. + */ + static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; u32 value = 0; switch (addr & 0x0c) { case GICD_CTLR: - if (vcpu->kvm->arch.vgic.enabled) + if (vgic->enabled) value |= GICD_CTLR_ENABLE_SS_G1; value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; break; case GICD_TYPER: - value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS; value = (value >> 5) - 1; if (vgic_has_its(vcpu->kvm)) { value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; @@ -81,7 +89,9 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, } break; case GICD_IIDR: - value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) | + (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) | + (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT); break; default: return 0; @@ -110,6 +120,20 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, } } +static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + switch (addr & 0x0c) { + case GICD_IIDR: + if (val != vgic_mmio_read_v3_misc(vcpu, addr, len)) + return -EINVAL; + } + + vgic_mmio_write_v3_misc(vcpu, addr, len, val); + return 0; +} + static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -246,9 +270,9 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, return value; } -static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val) +static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; @@ -273,6 +297,8 @@ static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, vgic_put_irq(vcpu->kvm, irq); } + + return 0; } /* We want to avoid outer shareable. */ @@ -444,14 +470,15 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, } static const struct vgic_register_region vgic_v3_dist_registers[] = { - REGISTER_DESC_WITH_LENGTH(GICD_CTLR, - vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16, - VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR, + vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, + NULL, vgic_mmio_uaccess_write_v3_misc, + 16, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICD_STATUSR, vgic_mmio_read_rao, vgic_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, - vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1, + vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, @@ -465,7 +492,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, vgic_mmio_read_pending, vgic_mmio_write_cpending, - vgic_mmio_read_raz, vgic_mmio_write_wi, 1, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, vgic_mmio_read_active, vgic_mmio_write_sactive, @@ -524,7 +551,7 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = { static const struct vgic_register_region vgic_v3_sgibase_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0, - vgic_mmio_read_rao, vgic_mmio_write_wi, 4, + vgic_mmio_read_group, vgic_mmio_write_group, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0, vgic_mmio_read_enable, vgic_mmio_write_senable, 4, @@ -538,7 +565,7 @@ static const struct vgic_register_region vgic_v3_sgibase_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICPENDR0, vgic_mmio_read_pending, vgic_mmio_write_cpending, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISACTIVER0, vgic_mmio_read_active, vgic_mmio_write_sactive, @@ -873,7 +900,8 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) /** * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs * @vcpu: The VCPU requesting a SGI - * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU + * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU + * @allow_group1: Does the sysreg access allow generation of G1 SGIs * * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. * This will trap in sys_regs.c and call this function. @@ -883,7 +911,7 @@ static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) * check for matching ones. If this bit is set, we signal all, but not the * calling VCPU. */ -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *c_vcpu; @@ -932,9 +960,19 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + /* + * An access targetting Group0 SGIs can only generate + * those, while an access targetting Group1 SGIs can + * generate interrupts of either group. + */ + if (!irq->group || allow_group1) { + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + } else { + spin_unlock_irqrestore(&irq->irq_lock, flags); + } + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index ff9655cfeb2f..f56ff1cf52ec 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -40,6 +40,51 @@ void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, /* Ignore */ } +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ + return 0; +} + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->group) + value |= BIT(i); + + vgic_put_irq(vcpu->kvm, irq); + } + + return value; +} + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock_irqsave(&irq->irq_lock, flags); + irq->group = !!(val & BIT(i)); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } +} + /* * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value * of the enabled bit, so there is only one function for both here. @@ -363,11 +408,12 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, mutex_unlock(&vcpu->kvm->lock); } -void vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __vgic_mmio_write_cactive(vcpu, addr, len, val); + return 0; } static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, @@ -399,11 +445,12 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, mutex_unlock(&vcpu->kvm->lock); } -void vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __vgic_mmio_write_sactive(vcpu, addr, len, val); + return 0; } unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, @@ -735,10 +782,9 @@ static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; if (region->uaccess_write) - region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); - else - region->write(r_vcpu, addr, sizeof(u32), *val); + return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); + region->write(r_vcpu, addr, sizeof(u32), *val); return 0; } diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 5693f6df45ec..a07f90acdaec 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -37,8 +37,8 @@ struct vgic_register_region { unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); union { - void (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len, unsigned long val); + int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val); @@ -134,6 +134,15 @@ unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); +int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + +void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); @@ -167,13 +176,13 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); -void vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); +int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); -void vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len, - unsigned long val); +int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index a5f2e44f1c33..69b892abd7dc 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -62,7 +62,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; int lr; - unsigned long flags; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); cpuif->vgic_hcr &= ~GICH_HCR_UIE; @@ -83,7 +84,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - spin_lock_irqsave(&irq->irq_lock, flags); + spin_lock(&irq->irq_lock); /* Always preserve the active bit */ irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -126,7 +127,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) vgic_irq_set_phys_active(irq, false); } - spin_unlock_irqrestore(&irq->irq_lock, flags); + spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -159,6 +160,9 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) } } + if (irq->group) + val |= GICH_LR_GROUP1; + if (irq->hw) { val |= GICH_LR_HW; val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index cdce653e3c47..9c0dd234ebe8 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -46,7 +46,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; - unsigned long flags; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); cpuif->vgic_hcr &= ~ICH_HCR_UIE; @@ -75,7 +76,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (!irq) /* An LPI could have been unmapped. */ continue; - spin_lock_irqsave(&irq->irq_lock, flags); + spin_lock(&irq->irq_lock); /* Always preserve the active bit */ irq->active = !!(val & ICH_LR_ACTIVE_BIT); @@ -118,7 +119,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) vgic_irq_set_phys_active(irq, false); } - spin_unlock_irqrestore(&irq->irq_lock, flags); + spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -197,11 +198,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) irq->line_level = false; - /* - * We currently only support Group1 interrupts, which is a - * known defect. This needs to be addressed at some point. - */ - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + if (irq->group) val |= ICH_LR_GROUP; val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 33c8325c8f35..7cfdfbc910e0 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -28,12 +28,6 @@ #define CREATE_TRACE_POINTS #include "trace.h" -#ifdef CONFIG_DEBUG_SPINLOCK -#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) -#else -#define DEBUG_SPINLOCK_BUG_ON(p) -#endif - struct vgic_global kvm_vgic_global_state __ro_after_init = { .gicv3_cpuif = STATIC_KEY_FALSE_INIT, }; @@ -599,10 +593,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; - unsigned long flags; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); retry: - spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + spin_lock(&vgic_cpu->ap_list_lock); list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; @@ -643,7 +638,7 @@ retry: /* This interrupt looks like it has to be migrated. */ spin_unlock(&irq->irq_lock); - spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + spin_unlock(&vgic_cpu->ap_list_lock); /* * Ensure locking order by always locking the smallest @@ -657,7 +652,7 @@ retry: vcpuB = vcpu; } - spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags); + spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, SINGLE_DEPTH_NESTING); spin_lock(&irq->irq_lock); @@ -682,7 +677,7 @@ retry: spin_unlock(&irq->irq_lock); spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); - spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock, flags); + spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); if (target_vcpu_needs_kick) { kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu); @@ -692,7 +687,7 @@ retry: goto retry; } - spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + spin_unlock(&vgic_cpu->ap_list_lock); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index ead00b2072b2..a90024718ca4 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -103,6 +103,12 @@ #define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) #define KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 +#ifdef CONFIG_DEBUG_SPINLOCK +#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) +#else +#define DEBUG_SPINLOCK_BUG_ON(p) +#endif + /* Requires the irq_lock to be held by the caller. */ static inline bool irq_is_pending(struct vgic_irq *irq) { @@ -305,6 +311,7 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); } +int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); |