Diffstat (limited to 'arch')
30 files changed, 343 insertions, 127 deletions
diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index 5edec2f49ec9..deff21bfa680 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -314,36 +314,32 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs); unsigned int type; u32 instr = 0; - u16 tinstr = 0; int isize = 4; int thumb2_32b = 0; - int fault; instrptr = instruction_pointer(regs); if (compat_thumb_mode(regs)) { __le16 __user *ptr = (__le16 __user *)(instrptr & ~1); + u16 tinstr, tinst2; - fault = alignment_get_thumb(regs, ptr, &tinstr); - if (!fault) { - if (IS_T32(tinstr)) { - /* Thumb-2 32-bit */ - u16 tinst2; - fault = alignment_get_thumb(regs, ptr + 1, &tinst2); - instr = ((u32)tinstr << 16) | tinst2; - thumb2_32b = 1; - } else { - isize = 2; - instr = thumb2arm(tinstr); - } + if (alignment_get_thumb(regs, ptr, &tinstr)) + return 1; + + if (IS_T32(tinstr)) { /* Thumb-2 32-bit */ + if (alignment_get_thumb(regs, ptr + 1, &tinst2)) + return 1; + instr = ((u32)tinstr << 16) | tinst2; + thumb2_32b = 1; + } else { + isize = 2; + instr = thumb2arm(tinstr); } } else { - fault = alignment_get_arm(regs, (__le32 __user *)instrptr, &instr); + if (alignment_get_arm(regs, (__le32 __user *)instrptr, &instr)) + return 1; } - if (fault) - return 1; - switch (CODING_BITS(instr)) { case 0x00000000: /* 3.13.4 load/store instruction extensions */ if (LDSTHD_I_BIT(instr)) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3bd732eaf087..3f6a5efdbcf0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -220,6 +220,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7113587222ff..3b9d4d24c361 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, }; + unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ u32 level = ~0; int ret; + /* + * Disable IRQs so that we hazard against a concurrent + * teardown of the userspace page tables (which relies on + * IPI-ing threads). + */ + local_irq_save(flags); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); - VM_BUG_ON(ret); - VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); - VM_BUG_ON(!(pte & PTE_VALID)); + local_irq_restore(flags); + + if (ret) + return ret; + + /* + * Not seeing an error, but not updating level? Something went + * deeply wrong... + */ + if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EFAULT; + + /* Oops, the userspace PTs are gone... Replay the fault */ + if (!kvm_pte_valid(pte)) + return -EAGAIN; return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } @@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * * Returns the size of the mapping. */ -static unsigned long +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) @@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. 
*/ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - get_user_mapping_size(kvm, hva) >= PMD_SIZE) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + int sz = get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, { int ret = 0; bool write_fault, writable, force_pte = false; - bool exec_fault; + bool exec_fault, mte_allowed; bool device = false; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; @@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); - unsigned long vma_pagesize, fault_granule; + long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; @@ -1218,6 +1244,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + if (fault_status != ESR_ELx_FSC_PERM || + (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } + + /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. */ @@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_ipa &= ~(vma_pagesize - 1); gfn = fault_ipa >> PAGE_SHIFT; - mmap_read_unlock(current->mm); + mte_allowed = kvm_vma_mte_allowed(vma); - /* - * Permission faults just need to update the existing leaf entry, - * and so normally don't require allocations from the memcache. The - * only exception to this is when dirty logging is enabled at runtime - * and a write fault needs to collapse a block entry into a table. - */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; - } + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_invalidate_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_gfn will take it away - * from us again properly. This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * acquiring kvm->mmu_lock. * - * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is - * used to avoid unnecessary overhead introduced to locate the memory - * slot because it's always fixed even @gfn is adjusted for huge pages. 
+ * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - smp_rmb(); + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); @@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); + + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } } if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ - if (kvm_vma_mte_allowed(vma)) { + if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); } else { ret = -EFAULT; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 24908400e190..c243b10f3e15 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) if (!kvm_pmu_is_3p5(vcpu)) val &= ~ARMV8_PMU_PMCR_LP; - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + /* The reset bits don't indicate any state, and shouldn't be saved. */ + __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 53749d3a0996..1b2c161120be 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -856,6 +856,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) return true; } +static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + *val = kvm_pmu_get_counter_value(vcpu, idx); + return 0; +} + static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1072,7 +1088,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ - .reset = reset_pmevcntr, \ + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ @@ -1982,7 +1998,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index 33788668cbdb..3779e7855bd7 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -5,6 +5,8 @@ #include <asm/bmips.h> #include <asm/io.h> +bool bmips_rac_flush_disable; + void arch_sync_dma_for_cpu_all(void) { void __iomem *cbr = BMIPS_GET_CBR(); @@ -15,6 +17,9 @@ void arch_sync_dma_for_cpu_all(void) boot_cpu_type() != CPU_BMIPS4380) return; + if (unlikely(bmips_rac_flush_disable)) + return; + /* Flush stale data out of the readahead cache */ cfg = __raw_readl(cbr + 
BMIPS_RAC_CONFIG); __raw_writel(cfg | 0x100, cbr + BMIPS_RAC_CONFIG); diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index e95b3f78e7cd..549a6392a3d2 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -35,6 +35,8 @@ #define REG_BCM6328_OTP ((void __iomem *)CKSEG1ADDR(0x1000062c)) #define BCM6328_TP1_DISABLED BIT(9) +extern bool bmips_rac_flush_disable; + static const unsigned long kbase = VMLINUX_LOAD_ADDRESS & 0xfff00000; struct bmips_quirk { @@ -104,6 +106,12 @@ static void bcm6358_quirks(void) * disable SMP for now */ bmips_smp_enabled = 0; + + /* + * RAC flush causes kernel panics on BCM6358 when booting from TP1 + * because the bootloader is not initializing it properly. + */ + bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)); } static void bcm6368_quirks(void) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 2bbc0fcce04a..5e26c7f2c25a 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -148,6 +148,11 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, */ } +static inline bool __pte_protnone(unsigned long pte) +{ + return (pte & (pgprot_val(PAGE_NONE) | _PAGE_RWX)) == pgprot_val(PAGE_NONE); +} + static inline bool __pte_flags_need_flush(unsigned long oldval, unsigned long newval) { @@ -164,8 +169,8 @@ static inline bool __pte_flags_need_flush(unsigned long oldval, /* * We do not expect kernel mappings or non-PTEs or not-present PTEs. */ - VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED); - VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(!__pte_protnone(oldval) && oldval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(!__pte_protnone(newval) && newval & _PAGE_PRIVILEGED); VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE)); VM_WARN_ON_ONCE(!(newval & _PAGE_PTE)); VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT)); diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 2087a785f05f..5fff0d04b23f 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -290,6 +290,9 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, static int ppr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { + if (!target->thread.regs) + return -EINVAL; + return membuf_write(&to, &target->thread.regs->ppr, sizeof(u64)); } @@ -297,6 +300,9 @@ static int ppr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + if (!target->thread.regs) + return -EINVAL; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.regs->ppr, 0, sizeof(u64)); } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4c5405fc5538..d23e25e8432d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -576,6 +576,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif +#ifdef CONFIG_HAVE_KVM_IRQFD + case KVM_CAP_IRQFD_RESAMPLE: + r = !xive_enabled(); + break; +#endif + case KVM_CAP_PPC_ALLOC_HTAB: r = hv_enabled; break; diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 559112312810..513180467562 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -856,6 +856,13 @@ int pseries_vas_dlpar_cpu(void) { int new_nr_creds, rc; + /* + * NX-GZIP is not 
enabled. Nothing to do for DLPAR event + */ + if (!copypaste_feat) + return 0; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, (u64)virt_to_phys(&hv_cop_caps)); @@ -1012,6 +1019,7 @@ static int __init pseries_vas_init(void) * Linux supports user space COPY/PASTE only with Radix */ if (!radix_enabled()) { + copypaste_feat = false; pr_err("API is supported only with radix page tables\n"); return -ENOTSUPP; } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 5b182d1c196c..eb7f29a412f8 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -126,6 +126,7 @@ config RISCV select OF_IRQ select PCI_DOMAINS_GENERIC if PCI select PCI_MSI if PCI + select RISCV_ALTERNATIVE if !XIP_KERNEL select RISCV_INTC select RISCV_TIMER if RISCV_SBI select SIFIVE_PLIC @@ -401,9 +402,8 @@ config RISCV_ISA_C config RISCV_ISA_SVPBMT bool "SVPBMT extension support" depends on 64BIT && MMU - depends on !XIP_KERNEL + depends on RISCV_ALTERNATIVE default y - select RISCV_ALTERNATIVE help Adds support to dynamically detect the presence of the SVPBMT ISA-extension (Supervisor-mode: page-based memory types) and @@ -428,8 +428,8 @@ config TOOLCHAIN_HAS_ZBB config RISCV_ISA_ZBB bool "Zbb extension support for bit manipulation instructions" depends on TOOLCHAIN_HAS_ZBB - depends on !XIP_KERNEL && MMU - select RISCV_ALTERNATIVE + depends on MMU + depends on RISCV_ALTERNATIVE default y help Adds support to dynamically detect the presence of the ZBB @@ -443,9 +443,9 @@ config RISCV_ISA_ZBB config RISCV_ISA_ZICBOM bool "Zicbom extension support for non-coherent DMA operation" - depends on !XIP_KERNEL && MMU + depends on MMU + depends on RISCV_ALTERNATIVE default y - select RISCV_ALTERNATIVE select RISCV_DMA_NONCOHERENT help Adds support to dynamically detect the presence of the ZICBOM diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index 69621ae6d647..0c8f4652cd82 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -2,8 +2,7 @@ menu "CPU errata selection" config ERRATA_SIFIVE bool "SiFive errata" - depends on !XIP_KERNEL - select RISCV_ALTERNATIVE + depends on RISCV_ALTERNATIVE help All SiFive errata Kconfig depend on this Kconfig. Disabling this Kconfig will disable all SiFive errata. Please say "Y" @@ -35,8 +34,7 @@ config ERRATA_SIFIVE_CIP_1200 config ERRATA_THEAD bool "T-HEAD errata" - depends on !XIP_KERNEL - select RISCV_ALTERNATIVE + depends on RISCV_ALTERNATIVE help All T-HEAD errata Kconfig depend on this Kconfig. Disabling this Kconfig will disable all T-HEAD errata. 
Please say "Y" diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index e3021b2590de..6263a0de1c6a 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -57,18 +57,31 @@ struct riscv_isa_ext_data { unsigned int isa_ext_id; }; +unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); + +#define riscv_isa_extension_mask(ext) BIT_MASK(RISCV_ISA_EXT_##ext) + +bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit); +#define riscv_isa_extension_available(isa_bitmap, ext) \ + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext) + static __always_inline bool riscv_has_extension_likely(const unsigned long ext) { compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); - asm_volatile_goto( - ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1) - : - : [ext] "i" (ext) - : - : l_no); + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { + asm_volatile_goto( + ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1) + : + : [ext] "i" (ext) + : + : l_no); + } else { + if (!__riscv_isa_extension_available(NULL, ext)) + goto l_no; + } return true; l_no: @@ -81,26 +94,23 @@ riscv_has_extension_unlikely(const unsigned long ext) compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); - asm_volatile_goto( - ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1) - : - : [ext] "i" (ext) - : - : l_yes); + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { + asm_volatile_goto( + ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1) + : + : [ext] "i" (ext) + : + : l_yes); + } else { + if (__riscv_isa_extension_available(NULL, ext)) + goto l_yes; + } return false; l_yes: return true; } -unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); - -#define riscv_isa_extension_mask(ext) BIT_MASK(RISCV_ISA_EXT_##ext) - -bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit); -#define riscv_isa_extension_available(isa_bitmap, ext) \ - __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext) - #endif #endif /* _ASM_RISCV_HWCAP_H */ diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index ad34519c8a13..3ac2ff6a65da 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -147,10 +147,8 @@ static void kvm_riscv_vcpu_timer_blocking(struct kvm_vcpu *vcpu) return; delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t); - if (delta_ns) { - hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); - t->next_set = true; - } + hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); + t->next_set = true; } static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index b3235ab0ace8..ed646c583e4f 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -162,7 +162,7 @@ vdso_prepare: prepare0 ifdef CONFIG_EXPOLINE_EXTERN modules_prepare: expoline_prepare -expoline_prepare: +expoline_prepare: scripts $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o endif endif diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index cf9659e13f03..ea244a73efad 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -474,9 +474,7 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned long __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned long __user *)data); case PTRACE_ENABLE_TE: if 
(!MACHINE_HAS_TE) return -EIO; @@ -824,9 +822,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned int __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned int __user *)data); } return compat_ptrace_request(child, request, addr, data); } diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 0ee02dae14b2..2cda8d9d7c6e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -271,10 +271,18 @@ static int handle_prog(struct kvm_vcpu *vcpu) * handle_external_interrupt - used for external interruption interceptions * @vcpu: virtual cpu * - * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if - * the new PSW does not have external interrupts disabled. In the first case, - * we've got to deliver the interrupt manually, and in the second case, we - * drop to userspace to handle the situation there. + * This interception occurs if: + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt + * occurred. In this case, the interrupt needs to be injected manually to + * preserve interrupt priority. + * - the external new PSW has external interrupts enabled, which will cause an + * interruption loop. We drop to userspace in this case. + * + * The latter case can be detected by inspecting the external mask bit in the + * external new psw. + * + * Under PV, only the latter case can occur, since interrupt priorities are + * handled in the ultravisor. */ static int handle_external_interrupt(struct kvm_vcpu *vcpu) { @@ -285,10 +293,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) vcpu->stat.exit_external_interrupt++; - rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); - if (rc) - return rc; - /* We can not handle clock comparator or timer interrupt with bad PSW */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + newpsw = vcpu->arch.sie_block->gpsw; + } else { + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + } + + /* + * Clock comparator or timer interrupt with external interrupt enabled + * will cause interrupt loop. Drop to userspace. 
+ */ if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && (newpsw.mask & PSW_MASK_EXT)) return -EOPNOTSUPP; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 39b36562c043..1eeb9ae57879 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -573,6 +573,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 720036fb1924..d44214072779 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -172,7 +172,7 @@ unsigned long __clear_user(void __user *to, unsigned long size) "4: slgr %0,%0\n" "5:\n" EX_TABLE(0b,2b) EX_TABLE(6b,2b) EX_TABLE(3b,5b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) + : "+&a" (size), "+&a" (to), "+a" (tmp1), "=&a" (tmp2) : "a" (empty_zero_page), [spec] "d" (spec.val) : "cc", "memory", "0"); return size; diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index cbaf174d8efd..b3af2d45bbbb 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -125,6 +125,8 @@ #define INTEL_FAM6_LUNARLAKE_M 0xBD +#define INTEL_FAM6_ARROWLAKE 0xC6 + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1c38174b5f01..0dac4ab5b55b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -146,7 +146,11 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } - if (madt->header.revision >= 5) + + /* ACPI 6.3 and newer support the online capable bit. */ + if (acpi_gbl_FADT.header.revision > 6 || + (acpi_gbl_FADT.header.revision == 6 && + acpi_gbl_FADT.minor_revision >= 3)) acpi_support_online_capable = true; default_acpi_madt_oem_check(madt->header.oem_id, @@ -193,7 +197,8 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags) if (lapic_flags & ACPI_MADT_ENABLED) return true; - if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + if (!acpi_support_online_capable || + (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) return true; return false; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f36dc2f796c5..f1197366a97d 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -358,12 +358,16 @@ static void __init ms_hyperv_init_platform(void) * To mirror what Windows does we should extract CPU management * features and use the ReservedIdentityBit to detect if Linux is the * root partition. But that requires negotiating CPU management - * interface (a process to be finalized). + * interface (a process to be finalized). For now, use the privilege + * flag as the indicator for running as root. * - * For now, use the privilege flag as the indicator for running as - * root. + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. 
*/ - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { hv_root_partition = true; pr_info("Hyper-V: running as root partition\n"); } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 042dee556125..995eb5054360 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -368,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG - && ioapic->irr & (1 << index)) - ioapic_service(ioapic, index, false); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && + ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) { + /* + * Pending status in irr may be outdated: the IRQ line may have + * already been deasserted by a device while the IRQ was masked. + * This occurs, for instance, if the interrupt is handled in a + * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this + * case the guest acknowledges the interrupt to the device in + * its threaded irq handler, i.e. after the EOI but before + * unmasking, so at the time of unmasking the IRQ line is + * already down but our pending irr bit is still set. In such + * cases, injecting this pending interrupt to the guest is + * buggy: the guest will receive an extra unwanted interrupt. + * + * So we need to check here if the IRQ is actually still pending. + * As we are generally not able to probe the IRQ line status + * directly, we do it through irqfd resampler. Namely, we clear + * the pending status and notify the resampler that this interrupt + * is done, without actually injecting it into the guest. If the + * IRQ line is actually already deasserted, we are done. If it is + * still asserted, a new interrupt will be shortly triggered + * through irqfd and injected into the guest. + * + * If, however, it's not possible to resample (no irqfd resampler + * registered for this irq), then unconditionally inject this + * pending interrupt into the guest, so the guest will not miss + * an interrupt, although may get an extra unwanted interrupt. 
+ */ + if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index)) + ioapic->irr &= ~(1 << index); + else + ioapic_service(ioapic, index, false); + } if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 287e98ef9df3..6272dabec02d 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -12,6 +12,11 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, int hv_remote_flush_tlb(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ +static inline int hv_remote_flush_tlb(struct kvm *kvm) +{ + return -EOPNOTSUPP; +} + static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 252e7f37e4e2..f25bc3cbb250 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3729,7 +3729,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3753,6 +3753,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) svm->current_vmcb->asid_generation--; } +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + hpa_t root_tdp = vcpu->arch.mmu->root.hpa; + + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly + * flush the NPT mappings via hypercall as flushing the ASID only + * affects virtual to physical mappings, it does not invalidate guest + * physical to host physical mappings. + */ + if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) + hyperv_flush_guest_mapping(root_tdp); + + svm_flush_tlb_asid(vcpu); +} + +static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB + * flushes should be routed to hv_remote_flush_tlb() without requesting + * a "regular" remote flush. Reaching this point means either there's + * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of + * which might be fatal to the guest. Yell, but try to recover. 
+ */ + if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) + hv_remote_flush_tlb(vcpu->kvm); + + svm_flush_tlb_asid(vcpu); +} + static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4745,10 +4776,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_current, + .flush_tlb_guest = svm_flush_tlb_asid, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index cff838f15db5..786d46d73a8e 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -6,6 +6,8 @@ #ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__ #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ +#include <asm/mshyperv.h> + #if IS_ENABLED(CONFIG_HYPERV) #include "kvm_onhyperv.h" @@ -15,6 +17,14 @@ static struct kvm_x86_ops svm_x86_ops; int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); +static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) +{ + struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; + + return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB && + !!hve->hv_enlightenments_control.enlightened_npt_tlb; +} + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; @@ -80,6 +90,11 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu) } #else +static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1bc2b80273c9..768487611db7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3868,7 +3868,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) exit_qual = 0; } - if (ex->has_error_code) { + /* + * Unlike AMD's Paged Real Mode, which reports an error code on #PF + * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the + * "has error code" flags on VM-Exit if the CPU is in Real Mode. + */ + if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7713420abab0..3d852ce84920 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4432,6 +4432,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -8903,6 +8904,8 @@ restart: } if (ctxt->have_exception) { + WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); + vcpu->mmio_needed = false; r = 1; inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { @@ -9906,13 +9909,20 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + /* + * Suppress the error code if the vCPU is in Real Mode, as Real Mode + * exceptions don't report error codes. 
The presence of an error code + * is carried with the exception and only stripped when the exception + * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do + * report an error code despite the CPU being in Real Mode. + */ + vcpu->arch.exception.has_error_code &= is_protmode(vcpu); + trace_kvm_inj_exception(vcpu->arch.exception.vector, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.injected); - if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) - vcpu->arch.exception.error_code = false; static_call(kvm_x86_inject_exception)(vcpu); } diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index cd98366a9b23..f0a7d1c2641e 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -539,7 +539,7 @@ static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { - size_t len; + size_t len, off = 0; if (!sp) sp = stack_pointer(task); @@ -548,9 +548,17 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) kstack_depth_to_print * STACK_DUMP_ENTRY_SIZE); printk("%sStack:\n", loglvl); - print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, - STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, - sp, len, false); + while (off < len) { + u8 line[STACK_DUMP_LINE_SIZE]; + size_t line_len = len - off > STACK_DUMP_LINE_SIZE ? + STACK_DUMP_LINE_SIZE : len - off; + + __memcpy(line, (u8 *)sp + off, line_len); + print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, + STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, + line, line_len, false); + off += STACK_DUMP_LINE_SIZE; + } show_trace(task, sp, loglvl); }
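The arm64 user_mem_abort() hunk above moves the mmu_invalidate_seq read so it is sampled before mmap_read_unlock(), relying on the unlock for the implied smp_rmb() that pairs with the smp_wmb() in kvm_mmu_invalidate_end(). For readers who do not have the surrounding KVM code in mind, the following is a minimal stand-alone sketch of the sample-sequence / unlocked-lookup / recheck-and-retry pattern that comment describes. It is plain C with a pthread mutex standing in for kvm->mmu_lock; all names here are illustrative only, not kernel APIs.

#include <stdbool.h>
#include <pthread.h>

struct mmu {
	pthread_mutex_t lock;
	unsigned long invalidate_seq;	/* bumped on every completed invalidation */
};

/* Invalidation side: bump the sequence while holding the lock. */
static void invalidate_range(struct mmu *mmu)
{
	pthread_mutex_lock(&mmu->lock);
	mmu->invalidate_seq++;
	/* ... tear down the affected translations here ... */
	pthread_mutex_unlock(&mmu->lock);
}

/*
 * Fault side: sample the sequence first, do the slow lookup without the
 * lock, then re-check under the lock and bail out (retry) if an
 * invalidation ran in between.
 */
static bool install_mapping(struct mmu *mmu)
{
	unsigned long seq = __atomic_load_n(&mmu->invalidate_seq, __ATOMIC_ACQUIRE);

	/* slow path: resolve the backing page without holding mmu->lock */

	pthread_mutex_lock(&mmu->lock);
	if (seq != mmu->invalidate_seq) {
		/* raced with an invalidation: drop the result, caller retries */
		pthread_mutex_unlock(&mmu->lock);
		return false;
	}
	/* ... install the mapping, knowing the lookup result is still valid ... */
	pthread_mutex_unlock(&mmu->lock);
	return true;
}

int main(void)
{
	struct mmu mmu = { .lock = PTHREAD_MUTEX_INITIALIZER, .invalidate_seq = 0 };

	while (!install_mapping(&mmu))
		;	/* retry until no invalidation raced with the lookup */
	invalidate_range(&mmu);
	return 0;
}

The real code is stricter than this sketch: mmu_invalidate_retry() also treats in-progress invalidations as a reason to retry, and the ordering comes from the barriers named in the comment rather than an explicit acquire load; the sketch only models the completed-invalidation counter.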