Diffstat (limited to 'arch'): 725 files changed, 12181 insertions, 10521 deletions
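A recurring conversion in the atomic64 hunks below (alpha, arc, arm) replaces long/long long with the fixed-width s64, giving every architecture the same 64-bit prototypes. A minimal sketch of the resulting shape on a plain, non-atomic accessor (illustrative only, not one of the patched functions):

#include <linux/compiler.h>	/* READ_ONCE() */
#include <linux/types.h>	/* s64: signed 64-bit on every architecture */

/*
 * Before this series alpha used "long" (64-bit there) while 32-bit
 * arm/arc used "long long"; with s64 the signature is uniform, so
 * generic code can rely on a single prototype.
 */
static inline s64 example_counter_read(const s64 *counter)
{
	return READ_ONCE(*counter);
}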
diff --git a/arch/Kconfig b/arch/Kconfig index c47b328eada0..e8d19c3cb91f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -260,6 +260,14 @@ config ARCH_HAS_SET_MEMORY config ARCH_HAS_SET_DIRECT_MAP bool +# +# Select if arch has an uncached kernel segment and provides the +# uncached_kernel_address / cached_kernel_address symbols to use it +# +config ARCH_HAS_UNCACHED_SEGMENT + select ARCH_HAS_DMA_PREP_COHERENT + bool + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 150a1c5d6a2c..2144530d1428 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -93,9 +93,9 @@ static inline int atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ } #define ATOMIC64_OP(op, asm_op) \ -static __inline__ void atomic64_##op(long i, atomic64_t * v) \ +static __inline__ void atomic64_##op(s64 i, atomic64_t * v) \ { \ - unsigned long temp; \ + s64 temp; \ __asm__ __volatile__( \ "1: ldq_l %0,%1\n" \ " " #asm_op " %0,%2,%0\n" \ @@ -109,9 +109,9 @@ static __inline__ void atomic64_##op(long i, atomic64_t * v) \ } \ #define ATOMIC64_OP_RETURN(op, asm_op) \ -static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \ +static __inline__ s64 atomic64_##op##_return_relaxed(s64 i, atomic64_t * v) \ { \ - long temp, result; \ + s64 temp, result; \ __asm__ __volatile__( \ "1: ldq_l %0,%1\n" \ " " #asm_op " %0,%3,%2\n" \ @@ -128,9 +128,9 @@ static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \ } #define ATOMIC64_FETCH_OP(op, asm_op) \ -static __inline__ long atomic64_fetch_##op##_relaxed(long i, atomic64_t * v) \ +static __inline__ s64 atomic64_fetch_##op##_relaxed(s64 i, atomic64_t * v) \ { \ - long temp, result; \ + s64 temp, result; \ __asm__ __volatile__( \ "1: ldq_l %2,%1\n" \ " " #asm_op " %2,%3,%0\n" \ @@ -246,9 +246,9 @@ static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ long atomic64_fetch_add_unless(atomic64_t *v, long a, long u) +static __inline__ s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { - long c, new, old; + s64 c, new, old; smp_mb(); __asm__ __volatile__( "1: ldq_l %[old],%[mem]\n" @@ -276,9 +276,9 @@ static __inline__ long atomic64_fetch_add_unless(atomic64_t *v, long a, long u) * The function returns the old value of *v minus 1, even if * the atomic variable, v, was not decremented. */ -static inline long atomic64_dec_if_positive(atomic64_t *v) +static inline s64 atomic64_dec_if_positive(atomic64_t *v) { - long old, tmp; + s64 old, tmp; smp_mb(); __asm__ __volatile__( "1: ldq_l %[old],%[mem]\n" diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index 02f9f91bb4f0..71ded3b7d82d 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -5,6 +5,8 @@ #include <linux/mm.h> #include <linux/mmzone.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + /* * Allocate and free page tables. 
The xxx_kernel() versions are * used to allocate a kernel page table - this turns on ASN bits @@ -41,7 +43,7 @@ pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + pmd_t *ret = (pmd_t *)__get_free_page(GFP_PGTABLE_USER); return ret; } @@ -51,42 +53,6 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - return pte; -} - -static inline void -pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - pte_t *pte = pte_alloc_one_kernel(mm); - struct page *page; - - if (!pte) - return NULL; - page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline void -pte_free(struct mm_struct *mm, pgtable_t page) -{ - pgtable_page_dtor(page); - __free_page(page); -} - #define check_pgt_cache() do { } while (0) #endif /* _ALPHA_PGALLOC_H */ diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 976e89b116e5..de6c4df61082 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -122,6 +122,8 @@ #define SO_RCVTIMEO_NEW 66 #define SO_SNDTIMEO_NEW 67 +#define SO_DETACH_REUSEPORT_BPF 68 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 33e904a05881..a813020d2f11 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -225,7 +225,7 @@ do_sigreturn(struct sigcontext __user *sc) return; give_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } asmlinkage void @@ -253,7 +253,7 @@ do_rt_sigreturn(struct rt_sigframe __user *frame) return; give_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index d0dccae53ba9..5f90df30be20 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -614,8 +614,7 @@ void smp_imb(void) { /* Must wait other processors to flush their icache before continue. */ - if (on_each_cpu(ipi_imb, NULL, 1)) - printk(KERN_CRIT "smp_imb: timed out\n"); + on_each_cpu(ipi_imb, NULL, 1); } EXPORT_SYMBOL(smp_imb); @@ -630,9 +629,7 @@ flush_tlb_all(void) { /* Although we don't have any data to pass, we do want to synchronize with the other processors. 
*/ - if (on_each_cpu(ipi_flush_tlb_all, NULL, 1)) { - printk(KERN_CRIT "flush_tlb_all: timed out\n"); - } + on_each_cpu(ipi_flush_tlb_all, NULL, 1); } #define asn_locked() (cpu_data[smp_processor_id()].asn_lock) @@ -667,9 +664,7 @@ flush_tlb_mm(struct mm_struct *mm) } } - if (smp_call_function(ipi_flush_tlb_mm, mm, 1)) { - printk(KERN_CRIT "flush_tlb_mm: timed out\n"); - } + smp_call_function(ipi_flush_tlb_mm, mm, 1); preempt_enable(); } @@ -720,9 +715,7 @@ flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) data.mm = mm; data.addr = addr; - if (smp_call_function(ipi_flush_tlb_page, &data, 1)) { - printk(KERN_CRIT "flush_tlb_page: timed out\n"); - } + smp_call_function(ipi_flush_tlb_page, &data, 1); preempt_enable(); } @@ -772,9 +765,7 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, } } - if (smp_call_function(ipi_flush_icache_page, mm, 1)) { - printk(KERN_CRIT "flush_icache_page: timed out\n"); - } + smp_call_function(ipi_flush_icache_page, mm, 1); preempt_enable(); } diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 9e7704e44f6d..1db9bbcfb84e 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -473,3 +473,4 @@ 541 common fsconfig sys_fsconfig 542 common fsmount sys_fsmount 543 common fspick sys_fspick +544 common pidfd_open sys_pidfd_open diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index bc9627698796..f6b9664ac504 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -402,7 +402,7 @@ do_entDbg(struct pt_regs *regs) { die_if_kernel("Instruction fault", regs, 0, NULL); - force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current); + force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0); } diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 188fc9256baf..741e61ef9d3f 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -221,13 +221,13 @@ retry: up_read(&mm->mmap_sem); /* Send a sigbus, regardless of whether we were in kernel or user mode. */ - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0); if (!user_mode(regs)) goto no_context; return; do_sigsegv: - force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0, current); + force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0); return; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c index 310a4ce1dccc..1b1259c7d7d1 100644 --- a/arch/alpha/oprofile/common.c +++ b/arch/alpha/oprofile/common.c @@ -65,7 +65,7 @@ op_axp_setup(void) model->reg_setup(®, ctr, &sys); /* Configure the registers on all cpus. 
*/ - (void)smp_call_function(model->cpu_setup, ®, 1); + smp_call_function(model->cpu_setup, ®, 1); model->cpu_setup(®); return 0; } @@ -86,7 +86,7 @@ op_axp_cpu_start(void *dummy) static int op_axp_start(void) { - (void)smp_call_function(op_axp_cpu_start, NULL, 1); + smp_call_function(op_axp_cpu_start, NULL, 1); op_axp_cpu_start(NULL); return 0; } @@ -101,7 +101,7 @@ op_axp_cpu_stop(void *dummy) static void op_axp_stop(void) { - (void)smp_call_function(op_axp_cpu_stop, NULL, 1); + smp_call_function(op_axp_cpu_stop, NULL, 1); op_axp_cpu_stop(NULL); } diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 1c8137e7247b..8383155c8c82 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -7,6 +7,7 @@ config ARC def_bool y select ARC_TIMERS select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SYNC_DMA_FOR_CPU @@ -16,6 +17,7 @@ config ARC select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK + select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC) select GENERIC_CLOCKEVENTS select GENERIC_FIND_FIRST_BIT diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 17cf1c657cb3..7298ce84762e 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -321,14 +321,14 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3) */ typedef struct { - aligned_u64 counter; + s64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(a) { (a) } -static inline long long atomic64_read(const atomic64_t *v) +static inline s64 atomic64_read(const atomic64_t *v) { - unsigned long long val; + s64 val; __asm__ __volatile__( " ldd %0, [%1] \n" @@ -338,7 +338,7 @@ static inline long long atomic64_read(const atomic64_t *v) return val; } -static inline void atomic64_set(atomic64_t *v, long long a) +static inline void atomic64_set(atomic64_t *v, s64 a) { /* * This could have been a simple assignment in "C" but would need @@ -359,9 +359,9 @@ static inline void atomic64_set(atomic64_t *v, long long a) } #define ATOMIC64_OP(op, op1, op2) \ -static inline void atomic64_##op(long long a, atomic64_t *v) \ +static inline void atomic64_##op(s64 a, atomic64_t *v) \ { \ - unsigned long long val; \ + s64 val; \ \ __asm__ __volatile__( \ "1: \n" \ @@ -372,13 +372,13 @@ static inline void atomic64_##op(long long a, atomic64_t *v) \ " bnz 1b \n" \ : "=&r"(val) \ : "r"(&v->counter), "ir"(a) \ - : "cc"); \ + : "cc"); \ } \ #define ATOMIC64_OP_RETURN(op, op1, op2) \ -static inline long long atomic64_##op##_return(long long a, atomic64_t *v) \ +static inline s64 atomic64_##op##_return(s64 a, atomic64_t *v) \ { \ - unsigned long long val; \ + s64 val; \ \ smp_mb(); \ \ @@ -399,9 +399,9 @@ static inline long long atomic64_##op##_return(long long a, atomic64_t *v) \ } #define ATOMIC64_FETCH_OP(op, op1, op2) \ -static inline long long atomic64_fetch_##op(long long a, atomic64_t *v) \ +static inline s64 atomic64_fetch_##op(s64 a, atomic64_t *v) \ { \ - unsigned long long val, orig; \ + s64 val, orig; \ \ smp_mb(); \ \ @@ -441,10 +441,10 @@ ATOMIC64_OPS(xor, xor, xor) #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP -static inline long long -atomic64_cmpxchg(atomic64_t *ptr, long long expected, long long new) +static inline s64 +atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new) { - long long prev; + s64 prev; smp_mb(); @@ -464,9 +464,9 @@ atomic64_cmpxchg(atomic64_t *ptr, long long expected, long long new) return prev; } -static inline long long 
atomic64_xchg(atomic64_t *ptr, long long new) +static inline s64 atomic64_xchg(atomic64_t *ptr, s64 new) { - long long prev; + s64 prev; smp_mb(); @@ -492,9 +492,9 @@ static inline long long atomic64_xchg(atomic64_t *ptr, long long new) * the atomic variable, v, was not decremented. */ -static inline long long atomic64_dec_if_positive(atomic64_t *v) +static inline s64 atomic64_dec_if_positive(atomic64_t *v) { - long long val; + s64 val; smp_mb(); @@ -525,10 +525,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) * Atomically adds @a to @v, if it was not @u. * Returns the old value of @v */ -static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a, - long long u) +static inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { - long long old, temp; + s64 old, temp; smp_mb(); diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index ff321f7df716..e1889ce3faf9 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -97,7 +97,7 @@ fault: goto again; fail: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return ret; } @@ -310,7 +310,7 @@ int elf_check_arch(const struct elf32_hdr *x) eflags = x->e_flags; if ((eflags & EF_ARC_OSABI_MSK) != EF_ARC_OSABI_CURRENT) { pr_err("ABI mismatch - you need newer toolchain\n"); - force_sigsegv(SIGSEGV, current); + force_sigsegv(SIGSEGV); return 0; } diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index b895f889602a..3d57ed0d8535 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -194,7 +194,7 @@ SYSCALL_DEFINE0(rt_sigreturn) return regs->r0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index e9a5b259f405..57235e5c0cea 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -47,7 +47,7 @@ unhandled_exception(const char *str, struct pt_regs *regs, tsk->thread.fault_address = (__force unsigned int)addr; - force_sig_fault(signo, si_code, addr, tsk); + force_sig_fault(signo, si_code, addr); } else { /* If not due to copy_(to|from)_user, we are doomed */ diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 0bf1468c35a3..62c210e7ee4c 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -8,51 +8,15 @@ #include <asm/cacheflush.h> /* - * ARCH specific callbacks for generic noncoherent DMA ops (dma/noncoherent.c) + * ARCH specific callbacks for generic noncoherent DMA ops * - hardware IOC not available (or "dma-coherent" not set for device in DT) * - But still handle both coherent and non-coherent requests from caller * * For DMA coherent hardware (IOC) generic code suffices */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - unsigned long order = get_order(size); - struct page *page; - phys_addr_t paddr; - void *kvaddr; - bool need_coh = !(attrs & DMA_ATTR_NON_CONSISTENT); - - /* - * __GFP_HIGHMEM flag is cleared by upper layer functions - * (in include/linux/dma-mapping.h) so we should never get a - * __GFP_HIGHMEM here. - */ - BUG_ON(gfp & __GFP_HIGHMEM); - - page = alloc_pages(gfp | __GFP_ZERO, order); - if (!page) - return NULL; - - /* This is linear addr (0x8000_0000 based) */ - paddr = page_to_phys(page); - - *dma_handle = paddr; - - /* - * A coherent buffer needs MMU mapping to enforce non-cachability. - * kvaddr is kernel Virtual address (0x7000_0000 based). 
- */ - if (need_coh) { - kvaddr = ioremap_nocache(paddr, size); - if (kvaddr == NULL) { - __free_pages(page, order); - return NULL; - } - } else { - kvaddr = (void *)(u32)paddr; - } +void arch_dma_prep_coherent(struct page *page, size_t size) +{ /* * Evict any existing L1 and/or L2 lines for the backing page * in case it was used earlier as a normal "cached" page. @@ -63,28 +27,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * Currently flush_cache_vmap nukes the L1 cache completely which * will be optimized as a separate commit */ - if (need_coh) - dma_cache_wback_inv(paddr, size); - - return kvaddr; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - phys_addr_t paddr = dma_handle; - struct page *page = virt_to_page(paddr); - - if (!(attrs & DMA_ATTR_NON_CONSISTENT)) - iounmap((void __force __iomem *)vaddr); - - __free_pages(page, get_order(size)); -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_addr); + dma_cache_wback_inv(page_to_phys(page), size); } /* @@ -161,3 +104,9 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, dev_info(dev, "use %sncoherent DMA ops\n", dev->dma_coherent ? "" : "non"); } + +static int __init atomic_pool_init(void) +{ + return dma_atomic_pool_init(GFP_KERNEL, pgprot_noncached(PAGE_KERNEL)); +} +postcore_initcall(atomic_pool_init); diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 8cca03480bb2..81e84426fe21 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -196,7 +196,7 @@ bad_area: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { tsk->thread.fault_address = address; - force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } @@ -231,5 +231,5 @@ do_sigbus: goto no_context; tsk->thread.fault_address = address; - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } diff --git a/arch/arc/plat-eznps/Kconfig b/arch/arc/plat-eznps/Kconfig index 2eaecfb063a7..a376a50d3fea 100644 --- a/arch/arc/plat-eznps/Kconfig +++ b/arch/arc/plat-eznps/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. 
# menuconfig ARC_PLAT_EZNPS diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8869742a85df..2bf1ce39a96d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,6 +4,7 @@ config ARM default y select ARCH_32BIT_OFF_T select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -30,6 +31,7 @@ config ARM select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_IPC_PARSE_VERSION + select BINFMT_FLAT_ARGVP_ENVP_ON_STACK select BUILDTIME_EXTABLE_SORT if MMU select CLONE_BACKWARDS select CPU_PM if SUSPEND || CPU_IDLE @@ -73,6 +75,7 @@ config ARM select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD + select HAVE_FAST_GUP if ARM_LPAE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL @@ -1175,6 +1178,14 @@ config ARM_ERRATA_825619 DMB NSHST or DMB ISHST instruction followed by a mix of Cacheable and Device/Strongly-Ordered loads and stores might cause deadlock +config ARM_ERRATA_857271 + bool "ARM errata: A12: CPU might deadlock under some very rare internal conditions" + depends on CPU_V7 + help + This option enables the workaround for the 857271 Cortex-A12 + (all revs) erratum. Under very rare timing conditions, the CPU might + hang. The workaround is expected to have a < 1% performance impact. + config ARM_ERRATA_852421 bool "ARM errata: A17: DMB ST might fail to create order between stores" depends on CPU_V7 @@ -1196,6 +1207,16 @@ config ARM_ERRATA_852423 config option from the A12 erratum due to the way errata are checked for and handled. +config ARM_ERRATA_857272 + bool "ARM errata: A17: CPU might deadlock under some very rare internal conditions" + depends on CPU_V7 + help + This option enables the workaround for the 857272 Cortex-A17 erratum. + This erratum is not known to be fixed in any A17 revision. + This is identical to Cortex-A12 erratum 857271. It is a separate + config option from the A12 erratum due to the way errata are checked + for and handled. + endmenu source "arch/arm/common/Kconfig" @@ -1232,6 +1253,18 @@ config PCI_HOST_ITE8152 default y select DMABOUNCE +config ARM_ERRATA_814220 + bool "ARM errata: Cache maintenance by set/way operations can execute out of order" + depends on CPU_V7 + help + The v7 ARM states that all cache and branch predictor maintenance + operations that do not specify an address execute, relative to + each other, in program order. + However, because of this erratum, an L2 set/way cache maintenance + operation can overtake an L1 set/way cache maintenance operation. + This ERRATA only affected the Cortex-A7 and present in r0p2, r0p3, + r0p4, r0p5. + endmenu menu "Kernel Features" @@ -1263,7 +1296,7 @@ config SMP uniprocessor machines. On a uniprocessor machine, the kernel will run faster if you say N here. - See also <file:Documentation/x86/i386/IO-APIC.txt>, + See also <file:Documentation/x86/i386/IO-APIC.rst>, <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at <http://tldp.org/HOWTO/SMP-HOWTO.html>. 
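The ARM_ERRATA_814220 help text above describes an L2 set/way cache maintenance operation overtaking an L1 one; the fix, visible in the arch/arm/mm/cache-v7.S hunk further down, is a DSB between the per-level loops. A C-level sketch of that shape, with a hypothetical helper standing in for the assembly:

#include <asm/barrier.h>	/* dsb() */

/* hypothetical helper: clean/invalidate one cache level by set/way */
void flush_dcache_level_by_setway(unsigned int level);

static void flush_dcache_all_levels(unsigned int nr_levels)
{
	unsigned int level;

	for (level = 0; level < nr_levels; level++) {
		flush_dcache_level_by_setway(level);
#ifdef CONFIG_ARM_ERRATA_814220
		dsb(sy);	/* keep L2 set/way ops from overtaking L1's */
#endif
	}
}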
@@ -1590,16 +1623,9 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool ARCH_SPARSEMEM_ENABLE -config ARCH_SELECT_MEMORY_MODEL - def_bool ARCH_SPARSEMEM_ENABLE - config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_GUP - def_bool y - depends on ARM_LPAE - config HIGHMEM bool "High Memory Support" depends on MMU @@ -2010,7 +2036,7 @@ config CRASH_DUMP kdump/kexec. The crash dump kernel must be compiled to a memory address not used by the main kernel - For more details see Documentation/kdump/kdump.txt + For more details see Documentation/kdump/kdump.rst config AUTO_ZRELADDR bool "Auto calculation of the decompressed kernel image address" diff --git a/arch/arm/boot/dts/armada-xp-98dx3236.dtsi b/arch/arm/boot/dts/armada-xp-98dx3236.dtsi index 59753470cd34..267d0c178e55 100644 --- a/arch/arm/boot/dts/armada-xp-98dx3236.dtsi +++ b/arch/arm/boot/dts/armada-xp-98dx3236.dtsi @@ -336,3 +336,11 @@ status = "disabled"; }; +&uart0 { + compatible = "marvell,armada-38x-uart"; +}; + +&uart1 { + compatible = "marvell,armada-38x-uart"; +}; + diff --git a/arch/arm/boot/dts/imx7ulp.dtsi b/arch/arm/boot/dts/imx7ulp.dtsi index d6b711011cba..e20483714be5 100644 --- a/arch/arm/boot/dts/imx7ulp.dtsi +++ b/arch/arm/boot/dts/imx7ulp.dtsi @@ -100,6 +100,29 @@ reg = <0x40000000 0x800000>; ranges; + crypto: crypto@40240000 { + compatible = "fsl,sec-v4.0"; + #address-cells = <1>; + #size-cells = <1>; + reg = <0x40240000 0x10000>; + ranges = <0 0x40240000 0x10000>; + clocks = <&pcc2 IMX7ULP_CLK_CAAM>, + <&scg1 IMX7ULP_CLK_NIC1_BUS_DIV>; + clock-names = "aclk", "ipg"; + + sec_jr0: jr0@1000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x1000 0x1000>; + interrupts = <GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>; + }; + + sec_jr1: jr1@2000 { + compatible = "fsl,sec-v4.0-job-ring"; + reg = <0x2000 0x1000>; + interrupts = <GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>; + }; + }; + lpuart4: serial@402d0000 { compatible = "fsl,imx7ulp-lpuart"; reg = <0x402d0000 0x1000>; diff --git a/arch/arm/boot/dts/rk3288-veyron.dtsi b/arch/arm/boot/dts/rk3288-veyron.dtsi index 1252522392c7..1d8bfed7830c 100644 --- a/arch/arm/boot/dts/rk3288-veyron.dtsi +++ b/arch/arm/boot/dts/rk3288-veyron.dtsi @@ -424,6 +424,7 @@ &usb_host1 { status = "okay"; + snps,need-phy-for-wake; }; &usb_otg { @@ -432,6 +433,7 @@ assigned-clocks = <&cru SCLK_USBPHY480M_SRC>; assigned-clock-parents = <&usbphy0>; dr_mode = "host"; + snps,need-phy-for-wake; }; &vopb { diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 13e561737ca8..746e1fce777e 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -539,16 +539,14 @@ static void bL_switcher_trace_trigger_cpu(void *__always_unused info) int bL_switcher_trace_trigger(void) { - int ret; - preempt_disable(); bL_switcher_trace_trigger_cpu(NULL); - ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true); + smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true); preempt_enable(); - return ret; + return 0; } EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger); diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index c95c54284da2..9b959afaaa12 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -9,6 +9,7 @@ CONFIG_MODULE_UNLOAD=y CONFIG_PARTITION_ADVANCED=y CONFIG_ARCH_EXYNOS=y CONFIG_ARCH_EXYNOS3=y +CONFIG_CPU_ICACHE_MISMATCH_WORKAROUND=y CONFIG_SMP=y CONFIG_BIG_LITTLE=y CONFIG_NR_CPUS=8 diff --git a/arch/arm/crypto/chacha-neon-glue.c 
b/arch/arm/crypto/chacha-neon-glue.c index 48a89537b828..a8e9b534c8da 100644 --- a/arch/arm/crypto/chacha-neon-glue.c +++ b/arch/arm/crypto/chacha-neon-glue.c @@ -63,7 +63,7 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, } static int chacha_neon_stream_xor(struct skcipher_request *req, - struct chacha_ctx *ctx, u8 *iv) + const struct chacha_ctx *ctx, const u8 *iv) { struct skcipher_walk walk; u32 state[16]; diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c index 232eeab1ec37..8775aa42bbbe 100644 --- a/arch/arm/crypto/sha512-glue.c +++ b/arch/arm/crypto/sha512-glue.c @@ -34,7 +34,7 @@ int sha512_arm_update(struct shash_desc *desc, const u8 *data, (sha512_block_fn *)sha512_block_data_order); } -int sha512_arm_final(struct shash_desc *desc, u8 *out) +static int sha512_arm_final(struct shash_desc *desc, u8 *out) { sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_block_data_order); diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index a8f149ab45b8..6b2dc15b6dff 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += early_ioremap.h generic-y += emergency-restart.h generic-y += exec.h generic-y += extable.h +generic-y += flat.h generic-y += irq_regs.h generic-y += kdebug.h generic-y += local.h diff --git a/arch/arm/include/asm/arch_timer.h b/arch/arm/include/asm/arch_timer.h index 4b66ecd6be99..99175812d903 100644 --- a/arch/arm/include/asm/arch_timer.h +++ b/arch/arm/include/asm/arch_timer.h @@ -4,6 +4,7 @@ #include <asm/barrier.h> #include <asm/errno.h> +#include <asm/hwcap.h> #include <linux/clocksource.h> #include <linux/init.h> #include <linux/types.h> @@ -124,6 +125,15 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl) isb(); } +static inline void arch_timer_set_evtstrm_feature(void) +{ + elf_hwcap |= HWCAP_EVTSTRM; +} + +static inline bool arch_timer_have_evtstrm_feature(void) +{ + return elf_hwcap & HWCAP_EVTSTRM; +} #endif #endif diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 50c3ac5f0809..75bb2c543e59 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -246,15 +246,15 @@ ATOMIC_OPS(xor, ^=, eor) #ifndef CONFIG_GENERIC_ATOMIC64 typedef struct { - long long counter; + s64 counter; } atomic64_t; #define ATOMIC64_INIT(i) { (i) } #ifdef CONFIG_ARM_LPAE -static inline long long atomic64_read(const atomic64_t *v) +static inline s64 atomic64_read(const atomic64_t *v) { - long long result; + s64 result; __asm__ __volatile__("@ atomic64_read\n" " ldrd %0, %H0, [%1]" @@ -265,7 +265,7 @@ static inline long long atomic64_read(const atomic64_t *v) return result; } -static inline void atomic64_set(atomic64_t *v, long long i) +static inline void atomic64_set(atomic64_t *v, s64 i) { __asm__ __volatile__("@ atomic64_set\n" " strd %2, %H2, [%1]" @@ -274,9 +274,9 @@ static inline void atomic64_set(atomic64_t *v, long long i) ); } #else -static inline long long atomic64_read(const atomic64_t *v) +static inline s64 atomic64_read(const atomic64_t *v) { - long long result; + s64 result; __asm__ __volatile__("@ atomic64_read\n" " ldrexd %0, %H0, [%1]" @@ -287,9 +287,9 @@ static inline long long atomic64_read(const atomic64_t *v) return result; } -static inline void atomic64_set(atomic64_t *v, long long i) +static inline void atomic64_set(atomic64_t *v, s64 i) { - long long tmp; + s64 tmp; prefetchw(&v->counter); __asm__ __volatile__("@ atomic64_set\n" @@ -304,9 +304,9 @@ static inline void atomic64_set(atomic64_t *v, long 
long i) #endif #define ATOMIC64_OP(op, op1, op2) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ +static inline void atomic64_##op(s64 i, atomic64_t *v) \ { \ - long long result; \ + s64 result; \ unsigned long tmp; \ \ prefetchw(&v->counter); \ @@ -323,10 +323,10 @@ static inline void atomic64_##op(long long i, atomic64_t *v) \ } \ #define ATOMIC64_OP_RETURN(op, op1, op2) \ -static inline long long \ -atomic64_##op##_return_relaxed(long long i, atomic64_t *v) \ +static inline s64 \ +atomic64_##op##_return_relaxed(s64 i, atomic64_t *v) \ { \ - long long result; \ + s64 result; \ unsigned long tmp; \ \ prefetchw(&v->counter); \ @@ -346,10 +346,10 @@ atomic64_##op##_return_relaxed(long long i, atomic64_t *v) \ } #define ATOMIC64_FETCH_OP(op, op1, op2) \ -static inline long long \ -atomic64_fetch_##op##_relaxed(long long i, atomic64_t *v) \ +static inline s64 \ +atomic64_fetch_##op##_relaxed(s64 i, atomic64_t *v) \ { \ - long long result, val; \ + s64 result, val; \ unsigned long tmp; \ \ prefetchw(&v->counter); \ @@ -403,10 +403,9 @@ ATOMIC64_OPS(xor, eor, eor) #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP -static inline long long -atomic64_cmpxchg_relaxed(atomic64_t *ptr, long long old, long long new) +static inline s64 atomic64_cmpxchg_relaxed(atomic64_t *ptr, s64 old, s64 new) { - long long oldval; + s64 oldval; unsigned long res; prefetchw(&ptr->counter); @@ -427,9 +426,9 @@ atomic64_cmpxchg_relaxed(atomic64_t *ptr, long long old, long long new) } #define atomic64_cmpxchg_relaxed atomic64_cmpxchg_relaxed -static inline long long atomic64_xchg_relaxed(atomic64_t *ptr, long long new) +static inline s64 atomic64_xchg_relaxed(atomic64_t *ptr, s64 new) { - long long result; + s64 result; unsigned long tmp; prefetchw(&ptr->counter); @@ -447,9 +446,9 @@ static inline long long atomic64_xchg_relaxed(atomic64_t *ptr, long long new) } #define atomic64_xchg_relaxed atomic64_xchg_relaxed -static inline long long atomic64_dec_if_positive(atomic64_t *v) +static inline s64 atomic64_dec_if_positive(atomic64_t *v) { - long long result; + s64 result; unsigned long tmp; smp_mb(); @@ -475,10 +474,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) } #define atomic64_dec_if_positive atomic64_dec_if_positive -static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a, - long long u) +static inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { - long long oldval, newval; + s64 oldval, newval; unsigned long tmp; smp_mb(); diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index 36c951dd23b8..deef4d0cb3b5 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -85,7 +85,7 @@ void hook_ifault_code(int nr, int (*fn)(unsigned long, unsigned int, extern asmlinkage void c_backtrace(unsigned long fp, int pmode); struct mm_struct; -extern void show_pte(struct mm_struct *mm, unsigned long addr); +void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr); extern void __show_regs(struct pt_regs *); #endif diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index d6667b8cfca5..7114b9aa46b8 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -476,4 +476,11 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size) void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, void *kaddr, unsigned long len); + +#ifdef CONFIG_CPU_ICACHE_MISMATCH_WORKAROUND +void check_cpu_icache_size(int cpuid); +#else +static inline 
void check_cpu_icache_size(int cpuid) { } +#endif + #endif diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 03ba90ffc0f8..7e0486ad1318 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -89,13 +89,6 @@ static inline dma_addr_t virt_to_dma(struct device *dev, void *addr) } #endif -/* The ARM override for dma_max_pfn() */ -static inline unsigned long dma_max_pfn(struct device *dev) -{ - return dma_to_pfn(dev, *dev->dma_mask); -} -#define dma_max_pfn(dev) dma_max_pfn(dev) - /* do not use this function in a driver */ static inline bool is_device_dma_coherent(struct device *dev) { diff --git a/arch/arm/include/asm/flat.h b/arch/arm/include/asm/flat.h deleted file mode 100644 index f0c75ddeea23..000000000000 --- a/arch/arm/include/asm/flat.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * arch/arm/include/asm/flat.h -- uClinux flat-format executables - */ - -#ifndef __ARM_FLAT_H__ -#define __ARM_FLAT_H__ - -#include <linux/uaccess.h> - -#define flat_argvp_envp_on_stack() 1 -#define flat_old_ram_flag(flags) (flags) -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) - -static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) -{ -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - return copy_from_user(addr, rp, 4) ? -EFAULT : 0; -#else - return get_user(*addr, rp); -#endif -} - -static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel) -{ -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - return copy_to_user(rp, &addr, 4) ? -EFAULT : 0; -#else - return put_user(addr, rp); -#endif -} - -#define flat_get_relocate_addr(rel) (rel) -#define flat_set_persistent(relval, p) 0 - -#endif /* __ARM_FLAT_H__ */ diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 6b7644a383f6..40002416efec 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -271,6 +271,16 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) return vcpu_cp15(vcpu, c0_MPIDR) & MPIDR_HWID_BITMASK; } +static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu) +{ + return false; +} + +static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu, + bool flag) +{ +} + static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) { *vcpu_cpsr(vcpu) |= PSR_E_BIT; diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index f80418ddeb60..8a37c8e89777 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -15,7 +15,6 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> #include <asm/fpstate.h> -#include <asm/smp_plat.h> #include <kvm/arm_arch_timer.h> #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -147,11 +146,10 @@ struct kvm_host_data { typedef struct kvm_host_data kvm_host_data_t; -static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt, - int cpu) +static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { /* The host's MPIDR is immutable, so let's set it up at boot time */ - cpu_ctxt->cp15[c0_MPIDR] = cpu_logical_map(cpu); + cpu_ctxt->cp15[c0_MPIDR] = read_cpuid_mpidr(); } struct vcpu_reset_state { @@ -362,7 +360,11 @@ static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_vhe_guest_enter(void) {} static inline void kvm_arm_vhe_guest_exit(void) {} -static inline bool 
kvm_arm_harden_branch_predictor(void) +#define KVM_BP_HARDEN_UNKNOWN -1 +#define KVM_BP_HARDEN_WA_NEEDED 0 +#define KVM_BP_HARDEN_NOT_REQUIRED 1 + +static inline int kvm_arm_harden_branch_predictor(void) { switch(read_cpuid_part()) { #ifdef CONFIG_HARDEN_BRANCH_PREDICTOR @@ -370,10 +372,12 @@ static inline bool kvm_arm_harden_branch_predictor(void) case ARM_CPU_PART_CORTEX_A12: case ARM_CPU_PART_CORTEX_A15: case ARM_CPU_PART_CORTEX_A17: - return true; + return KVM_BP_HARDEN_WA_NEEDED; #endif + case ARM_CPU_PART_CORTEX_A7: + return KVM_BP_HARDEN_NOT_REQUIRED; default: - return false; + return KVM_BP_HARDEN_UNKNOWN; } } diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 71ac1c8d101c..40e9034db601 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -82,13 +82,14 @@ #define VFP_FPEXC __ACCESS_VFP(FPEXC) /* AArch64 compatibility macros, only for the timer so far */ -#define read_sysreg_el0(r) read_sysreg(r##_el0) -#define write_sysreg_el0(v, r) write_sysreg(v, r##_el0) +#define read_sysreg_el0(r) read_sysreg(r##_EL0) +#define write_sysreg_el0(v, r) write_sysreg(v, r##_EL0) + +#define SYS_CNTP_CTL_EL0 CNTP_CTL +#define SYS_CNTP_CVAL_EL0 CNTP_CVAL +#define SYS_CNTV_CTL_EL0 CNTV_CTL +#define SYS_CNTV_CVAL_EL0 CNTV_CVAL -#define cntp_ctl_el0 CNTP_CTL -#define cntp_cval_el0 CNTP_CVAL -#define cntv_ctl_el0 CNTV_CTL -#define cntv_cval_el0 CNTV_CVAL #define cntvoff_el2 CNTVOFF #define cnthctl_el2 CNTHCTL diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index c038cff6fdd3..a2a68b751971 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -54,8 +54,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) - static inline void clean_pte_table(pte_t *pte) { clean_dcache_area(pte + PTE_HWTABLE_PTRS, PTE_HWTABLE_SIZE); @@ -77,54 +75,41 @@ static inline void clean_pte_table(pte_t *pte) * | h/w pt 1 | * +------------+ */ + +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include <asm-generic/pgalloc.h> + static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; + pte_t *pte = __pte_alloc_one_kernel(mm); - pte = (pte_t *)__get_free_page(PGALLOC_GFP); if (pte) clean_pte_table(pte); return pte; } +#ifdef CONFIG_HIGHPTE +#define PGTABLE_HIGHMEM __GFP_HIGHMEM +#else +#define PGTABLE_HIGHMEM 0 +#endif + static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = __pte_alloc_one(mm, GFP_PGTABLE_USER | PGTABLE_HIGHMEM); if (!pte) return NULL; if (!PageHighMem(pte)) clean_pte_table(page_address(pte)); - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } return pte; } -/* - * Free one PTE table. 
- */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - if (pte) - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte, pmdval_t prot) { diff --git a/arch/arm/include/asm/ptdump.h b/arch/arm/include/asm/ptdump.h index 3ebf9718288d..0c2d3d0d4cc6 100644 --- a/arch/arm/include/asm/ptdump.h +++ b/arch/arm/include/asm/ptdump.h @@ -21,13 +21,10 @@ struct ptdump_info { void ptdump_walk_pgd(struct seq_file *s, struct ptdump_info *info); #ifdef CONFIG_ARM_PTDUMP_DEBUGFS -int ptdump_debugfs_register(struct ptdump_info *info, const char *name); +void ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else -static inline int ptdump_debugfs_register(struct ptdump_info *info, - const char *name) -{ - return 0; -} +static inline void ptdump_debugfs_register(struct ptdump_info *info, + const char *name) { } #endif /* CONFIG_ARM_PTDUMP_DEBUGFS */ void ptdump_check_wx(void); diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index a00288d75ee6..172b08ff3760 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -30,7 +30,7 @@ static inline int __in_irqentry_text(unsigned long ptr) extern void __init early_trap_init(void *); extern void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame); -extern void ptrace_break(struct task_struct *tsk, struct pt_regs *regs); +extern void ptrace_break(struct pt_regs *regs); extern void *vectors_page; diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 9fb00973c608..3676e82cf95c 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -37,6 +37,7 @@ #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 /* * Unimplemented (or alternatively implemented) syscalls diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 4602464ebdfb..a4217c1a5d01 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -214,6 +214,18 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_FW_REG(r) (KVM_REG_ARM | KVM_REG_SIZE_U64 | \ KVM_REG_ARM_FW | ((r) & 0xffff)) #define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 KVM_REG_ARM_FW_REG(1) + /* Higher values mean better protection. */ +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2) + /* Higher values mean better protection. 
*/ +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL 2 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ed005870671a..e57dbcc89123 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -8,8 +8,7 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; pte_t pte = *ptep; diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index afcb4d3b14dc..324352787aea 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -198,15 +198,15 @@ void ptrace_disable(struct task_struct *child) /* * Handle hitting a breakpoint. */ -void ptrace_break(struct task_struct *tsk, struct pt_regs *regs) +void ptrace_break(struct pt_regs *regs) { force_sig_fault(SIGTRAP, TRAP_BRKPT, - (void __user *)instruction_pointer(regs), tsk); + (void __user *)instruction_pointer(regs)); } static int break_trap(struct pt_regs *regs, unsigned int instr) { - ptrace_break(current, regs); + ptrace_break(regs); return 0; } diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 3ca71d679aec..09f6fdd41974 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -247,7 +247,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs) return regs->ARM_r0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -280,7 +280,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) return regs->ARM_r0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index a137608cd197..aab8ba40ce38 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -372,6 +372,7 @@ static void smp_store_cpu_info(unsigned int cpuid) cpu_info->cpuid = read_cpuid_id(); store_cpu_topology(cpuid); + check_cpu_icache_size(cpuid); } /* diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 60e375ce1ab2..d17cb1e6d679 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu) topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); pr_info("CPU%u: update cpu_capacity %lu\n", - cpu, topology_get_cpu_scale(NULL, cpu)); + cpu, topology_get_cpu_scale(cpu)); } #else diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 7e2f1cba84e5..c053abd1fb53 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -369,7 +369,7 @@ void arm_notify_die(const char *str, struct pt_regs *regs, current->thread.error_code = err; current->thread.trap_no = trap; - force_sig_fault(signo, si_code, addr, current); + force_sig_fault(signo, si_code, addr); } else { die(str, regs, err); } @@ -603,7 +603,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) case NR(breakpoint): /* SWI BREAK_POINT */ regs->ARM_pc -= thumb_mode(regs) ? 
2 : 4; - ptrace_break(current, regs); + ptrace_break(regs); return regs->ARM_r0; /* @@ -722,10 +722,11 @@ baddataabort(int code, unsigned long instr, struct pt_regs *regs) #ifdef CONFIG_DEBUG_USER if (user_debug & UDBG_BADABORT) { + pr_err("8<--- cut here ---\n"); pr_err("[%d] %s: bad data abort: code %d instr 0x%08lx\n", task_pid_nr(current), current->comm, code, instr); dump_instr(KERN_ERR, regs); - show_pte(current->mm, addr); + show_pte(KERN_ERR, current->mm, addr); } #endif diff --git a/arch/arm/mach-davinci/board-da830-evm.c b/arch/arm/mach-davinci/board-da830-evm.c index 51a892702e27..a273ab25c668 100644 --- a/arch/arm/mach-davinci/board-da830-evm.c +++ b/arch/arm/mach-davinci/board-da830-evm.c @@ -61,6 +61,9 @@ static struct regulator_consumer_supply da830_evm_usb_supplies[] = { static struct regulator_init_data da830_evm_usb_vbus_data = { .consumer_supplies = da830_evm_usb_supplies, .num_consumer_supplies = ARRAY_SIZE(da830_evm_usb_supplies), + .constraints = { + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, }; static struct fixed_voltage_config da830_evm_usb_vbus = { @@ -88,7 +91,7 @@ static struct gpiod_lookup_table da830_evm_usb_oc_gpio_lookup = { static struct gpiod_lookup_table da830_evm_usb_vbus_gpio_lookup = { .dev_id = "reg-fixed-voltage.0", .table = { - GPIO_LOOKUP("davinci_gpio", ON_BD_USB_DRV, "vbus", 0), + GPIO_LOOKUP("davinci_gpio", ON_BD_USB_DRV, NULL, 0), { } }, }; diff --git a/arch/arm/mach-davinci/board-omapl138-hawk.c b/arch/arm/mach-davinci/board-omapl138-hawk.c index db177a6a7e48..5390a8630cf0 100644 --- a/arch/arm/mach-davinci/board-omapl138-hawk.c +++ b/arch/arm/mach-davinci/board-omapl138-hawk.c @@ -306,6 +306,9 @@ static struct regulator_consumer_supply hawk_usb_supplies[] = { static struct regulator_init_data hawk_usb_vbus_data = { .consumer_supplies = hawk_usb_supplies, .num_consumer_supplies = ARRAY_SIZE(hawk_usb_supplies), + .constraints = { + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, }; static struct fixed_voltage_config hawk_usb_vbus = { diff --git a/arch/arm/mach-omap1/ams-delta-fiq.c b/arch/arm/mach-omap1/ams-delta-fiq.c index 0af2bf6f9933..43899fa56674 100644 --- a/arch/arm/mach-omap1/ams-delta-fiq.c +++ b/arch/arm/mach-omap1/ams-delta-fiq.c @@ -11,6 +11,7 @@ * in the MontaVista 2.4 kernel (and the Amstrad changes therein) */ #include <linux/gpio/consumer.h> +#include <linux/gpio/machine.h> #include <linux/gpio/driver.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -99,7 +100,8 @@ void __init ams_delta_init_fiq(struct gpio_chip *chip, } for (i = 0; i < ARRAY_SIZE(irq_data); i++) { - gpiod = gpiochip_request_own_desc(chip, i, pin_name[i], 0); + gpiod = gpiochip_request_own_desc(chip, i, pin_name[i], + GPIO_ACTIVE_HIGH, GPIOD_IN); if (IS_ERR(gpiod)) { pr_err("%s: failed to get GPIO pin %d (%ld)\n", __func__, i, PTR_ERR(gpiod)); diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c index 36498ea1b2f3..e47a6fbcfd6e 100644 --- a/arch/arm/mach-omap1/board-ams-delta.c +++ b/arch/arm/mach-omap1/board-ams-delta.c @@ -10,6 +10,7 @@ */ #include <linux/gpio/driver.h> #include <linux/gpio/machine.h> +#include <linux/gpio/consumer.h> #include <linux/gpio.h> #include <linux/kernel.h> #include <linux/init.h> @@ -606,12 +607,12 @@ static void __init modem_assign_irq(struct gpio_chip *chip) struct gpio_desc *gpiod; gpiod = gpiochip_request_own_desc(chip, AMS_DELTA_GPIO_PIN_MODEM_IRQ, - "modem_irq", 0); + "modem_irq", GPIO_ACTIVE_HIGH, + GPIOD_IN); if (IS_ERR(gpiod)) { pr_err("%s: modem IRQ GPIO request failed 
(%ld)\n", __func__, PTR_ERR(gpiod)); } else { - gpiod_direction_input(gpiod); ams_delta_modem_ports[0].irq = gpiod_to_irq(gpiod); } } diff --git a/arch/arm/mach-omap1/clock.c b/arch/arm/mach-omap1/clock.c index 406fd2a9a88f..bd5be82101f3 100644 --- a/arch/arm/mach-omap1/clock.c +++ b/arch/arm/mach-omap1/clock.c @@ -987,84 +987,44 @@ static int debug_clock_show(struct seq_file *s, void *unused) DEFINE_SHOW_ATTRIBUTE(debug_clock); -static int clk_debugfs_register_one(struct clk *c) +static void clk_debugfs_register_one(struct clk *c) { - int err; struct dentry *d; struct clk *pa = c->parent; d = debugfs_create_dir(c->name, pa ? pa->dent : clk_debugfs_root); - if (!d) - return -ENOMEM; c->dent = d; - d = debugfs_create_u8("usecount", S_IRUGO, c->dent, &c->usecount); - if (!d) { - err = -ENOMEM; - goto err_out; - } - d = debugfs_create_ulong("rate", S_IRUGO, c->dent, &c->rate); - if (!d) { - err = -ENOMEM; - goto err_out; - } - d = debugfs_create_x8("flags", S_IRUGO, c->dent, &c->flags); - if (!d) { - err = -ENOMEM; - goto err_out; - } - return 0; - -err_out: - debugfs_remove_recursive(c->dent); - return err; + debugfs_create_u8("usecount", S_IRUGO, c->dent, &c->usecount); + debugfs_create_ulong("rate", S_IRUGO, c->dent, &c->rate); + debugfs_create_x8("flags", S_IRUGO, c->dent, &c->flags); } -static int clk_debugfs_register(struct clk *c) +static void clk_debugfs_register(struct clk *c) { - int err; struct clk *pa = c->parent; - if (pa && !pa->dent) { - err = clk_debugfs_register(pa); - if (err) - return err; - } + if (pa && !pa->dent) + clk_debugfs_register(pa); - if (!c->dent) { - err = clk_debugfs_register_one(c); - if (err) - return err; - } - return 0; + if (!c->dent) + clk_debugfs_register_one(c); } static int __init clk_debugfs_init(void) { struct clk *c; struct dentry *d; - int err; d = debugfs_create_dir("clock", NULL); - if (!d) - return -ENOMEM; clk_debugfs_root = d; - list_for_each_entry(c, &clocks, node) { - err = clk_debugfs_register(c); - if (err) - goto err_out; - } + list_for_each_entry(c, &clocks, node) + clk_debugfs_register(c); - d = debugfs_create_file("summary", S_IRUGO, - d, NULL, &debug_clock_fops); - if (!d) - return -ENOMEM; + debugfs_create_file("summary", S_IRUGO, d, NULL, &debug_clock_fops); return 0; -err_out: - debugfs_remove_recursive(clk_debugfs_root); - return err; } late_initcall(clk_debugfs_init); diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index 998075d3ef86..d068958d6f8a 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -539,11 +539,8 @@ static void omap_pm_init_debugfs(void) struct dentry *d; d = debugfs_create_dir("pm_debug", NULL); - if (!d) - return; - - (void) debugfs_create_file("omap_pm", S_IWUSR | S_IRUGO, - d, NULL, &omap_pm_debug_fops); + debugfs_create_file("omap_pm", S_IWUSR | S_IRUGO, d, NULL, + &omap_pm_debug_fops); } #endif /* CONFIG_DEBUG_FS */ diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c index fe6ec9b580b9..fceb1e525d26 100644 --- a/arch/arm/mach-omap2/pm-debug.c +++ b/arch/arm/mach-omap2/pm-debug.c @@ -190,9 +190,8 @@ static int __init pwrdms_setup(struct powerdomain *pwrdm, void *dir) return 0; d = debugfs_create_dir(pwrdm->name, (struct dentry *)dir); - if (d) - (void) debugfs_create_file("suspend", S_IRUGO|S_IWUSR, d, - (void *)pwrdm, &pwrdm_suspend_fops); + debugfs_create_file("suspend", S_IRUGO|S_IWUSR, d, pwrdm, + &pwrdm_suspend_fops); return 0; } @@ -230,16 +229,14 @@ static int __init pm_dbg_init(void) return 0; d = debugfs_create_dir("pm_debug", NULL); - 
if (!d) - return -EINVAL; - (void) debugfs_create_file("count", 0444, d, NULL, &pm_dbg_counters_fops); - (void) debugfs_create_file("time", 0444, d, NULL, &pm_dbg_timers_fops); + debugfs_create_file("count", 0444, d, NULL, &pm_dbg_counters_fops); + debugfs_create_file("time", 0444, d, NULL, &pm_dbg_timers_fops); pwrdm_for_each(pwrdms_setup, (void *)d); - (void) debugfs_create_file("enable_off_mode", S_IRUGO | S_IWUSR, d, - &enable_off_mode, &pm_dbg_option_fops); + debugfs_create_file("enable_off_mode", S_IRUGO | S_IWUSR, d, + &enable_off_mode, &pm_dbg_option_fops); pm_dbg_init_done = 1; return 0; diff --git a/arch/arm/mach-pxa/am200epd.c b/arch/arm/mach-pxa/am200epd.c index 50e18ed37fa6..cac0bb09db14 100644 --- a/arch/arm/mach-pxa/am200epd.c +++ b/arch/arm/mach-pxa/am200epd.c @@ -347,8 +347,17 @@ int __init am200_init(void) { int ret; - /* before anything else, we request notification for any fb - * creation events */ + /* + * Before anything else, we request notification for any fb + * creation events. + * + * FIXME: This is terrible and needs to be nuked. The notifier is used + * to get at the fb base address from the boot splash fb driver, which + * is then passed to metronomefb. Instaed of metronomfb or this board + * support file here figuring this out on their own. + * + * See also the #ifdef in fbmem.c. + */ fb_register_client(&am200_fb_notif); pxa2xx_mfp_config(ARRAY_AND_SIZE(am200_pin_config)); diff --git a/arch/arm/mach-s3c64xx/mach-crag6410.c b/arch/arm/mach-s3c64xx/mach-crag6410.c index 379424d72ae7..8ec6a4f5eb05 100644 --- a/arch/arm/mach-s3c64xx/mach-crag6410.c +++ b/arch/arm/mach-s3c64xx/mach-crag6410.c @@ -15,6 +15,7 @@ #include <linux/io.h> #include <linux/init.h> #include <linux/gpio.h> +#include <linux/gpio/machine.h> #include <linux/leds.h> #include <linux/delay.h> #include <linux/mmc/host.h> @@ -398,7 +399,6 @@ static struct pca953x_platform_data crag6410_pca_data = { /* VDDARM is controlled by DVS1 connected to GPK(0) */ static struct wm831x_buckv_pdata vddarm_pdata = { .dvs_control_src = 1, - .dvs_gpio = S3C64XX_GPK(0), }; static struct regulator_consumer_supply vddarm_consumers[] = { @@ -596,6 +596,24 @@ static struct wm831x_pdata crag_pmic_pdata = { .touch = &touch_pdata, }; +/* + * VDDARM is eventually ending up as a regulator hanging on the MFD cell device + * "wm831x-buckv.1" spawn from drivers/mfd/wm831x-core.c. 
+ * + * From the note on the platform data we can see that this is clearly DVS1 + * and assigned as dcdc1 resource to the MFD core which sets .id of the cell + * spawning the DVS1 platform device to 1, then the cell platform device + * name is calculated from 10*instance + id resulting in the device name + * "wm831x-buckv.11" + */ +static struct gpiod_lookup_table crag_pmic_gpiod_table = { + .dev_id = "wm831x-buckv.11", + .table = { + GPIO_LOOKUP("GPIOK", 0, "dvs", GPIO_ACTIVE_HIGH), + { }, + }, +}; + static struct i2c_board_info i2c_devs0[] = { { I2C_BOARD_INFO("24c08", 0x50), }, { I2C_BOARD_INFO("tca6408", 0x20), @@ -836,6 +854,7 @@ static void __init crag6410_machine_init(void) s3c_fb_set_platdata(&crag6410_lcd_pdata); dwc2_hsotg_set_platdata(&crag6410_hsotg_pdata); + gpiod_add_lookup_table(&crag_pmic_gpiod_table); i2c_register_board_info(0, i2c_devs0, ARRAY_SIZE(i2c_devs0)); i2c_register_board_info(1, i2c_devs1, ARRAY_SIZE(i2c_devs1)); diff --git a/arch/arm/mach-stm32/Kconfig b/arch/arm/mach-stm32/Kconfig index 36e6c68c0b57..05d6b5aada80 100644 --- a/arch/arm/mach-stm32/Kconfig +++ b/arch/arm/mach-stm32/Kconfig @@ -44,6 +44,7 @@ if ARCH_MULTI_V7 config MACH_STM32MP157 bool "STMicroelectronics STM32MP157" + select ARM_ERRATA_814220 default y endif # ARMv7-A diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index b169e580bf82..cc798115aa9b 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -780,6 +780,14 @@ config CPU_ICACHE_DISABLE Say Y here to disable the processor instruction cache. Unless you have a reason not to or are unsure, say N. +config CPU_ICACHE_MISMATCH_WORKAROUND + bool "Workaround for I-Cache line size mismatch between CPU cores" + depends on SMP && CPU_V7 + help + Some big.LITTLE systems have I-Cache line size mismatch between + LITTLE and big cores. Say Y here to enable a workaround for + proper I-Cache support on such systems. If unsure, say N. 
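The workaround behind CPU_ICACHE_MISMATCH_WORKAROUND (see check_cpu_icache_size() in the arch/arm/mm/init.c hunk near the end) reads each core's Cache Type Register and keeps the smallest reported line size. IminLine, CTR[3:0], encodes log2 of the I-cache line in 4-byte words, hence the 1 << ((ctr & 0xf) + 2) conversion to bytes. A standalone sketch of the decoding:

#include <stdio.h>

/* CTR[3:0] (IminLine) is log2(I-cache line size in 4-byte words),
 * so the size in bytes is 4 << IminLine, i.e. 1 << (IminLine + 2). */
static unsigned int ctr_icache_line_bytes(unsigned int ctr)
{
	return 1u << ((ctr & 0xf) + 2);
}

int main(void)
{
	printf("%u\n", ctr_icache_line_bytes(0x3));	/* 32-byte lines */
	printf("%u\n", ctr_icache_line_bytes(0x4));	/* 64-byte lines */
	return 0;
}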
+ config CPU_DCACHE_DISABLE bool "Disable D-Cache (C-bit)" depends on (CPU_CP15 && !SMP) || CPU_V7M diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 6067fa4de22b..8cdb78642e93 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -945,7 +945,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) goto fixup; if (ai_usermode & UM_SIGNAL) { - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); } else { /* * We're about to disable the alignment trap and return to diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 8c83b4586883..0ee8fc4b4672 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -16,6 +16,14 @@ #include "proc-macros.S" +#ifdef CONFIG_CPU_ICACHE_MISMATCH_WORKAROUND +.globl icache_size + .data + .align 2 +icache_size: + .long 64 + .text +#endif /* * The secondary kernel init calls v7_flush_dcache_all before it enables * the L1; however, the L1 comes out of reset in an undefined state, so @@ -160,6 +168,9 @@ loop2: skip: add r10, r10, #2 @ increment cache number cmp r3, r10 +#ifdef CONFIG_ARM_ERRATA_814220 + dsb +#endif bgt flush_levels finished: mov r10, #0 @ switch back to cache level 0 @@ -281,7 +292,12 @@ ENTRY(v7_coherent_user_range) cmp r12, r1 blo 1b dsb ishst +#ifdef CONFIG_CPU_ICACHE_MISMATCH_WORKAROUND + ldr r3, =icache_size + ldr r2, [r3, #0] +#else icache_line_size r2, r3 +#endif sub r3, r2, #1 bic r12, r0, r3 2: diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index 1aea01ba1262..52b82559d99b 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -35,18 +35,7 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t size, unsigned long attrs) { - void *ret; - - /* - * Try generic allocator first if we are advertised that - * consistency is not required. - */ - - if (attrs & DMA_ATTR_NON_CONSISTENT) - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, - attrs); - - ret = dma_alloc_from_global_coherent(size, dma_handle); + void *ret = dma_alloc_from_global_coherent(size, dma_handle); /* * dma_alloc_from_global_coherent() may fail because: @@ -66,16 +55,9 @@ static void arm_nommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (attrs & DMA_ATTR_NON_CONSISTENT) { - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); - } else { - int ret = dma_release_from_global_coherent(get_order(size), - cpu_addr); - - WARN_ON_ONCE(ret == 0); - } + int ret = dma_release_from_global_coherent(get_order(size), cpu_addr); - return; + WARN_ON_ONCE(ret == 0); } static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 439bb6a59a04..4789c60a86e3 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -216,25 +216,7 @@ EXPORT_SYMBOL(arm_coherent_dma_ops); static int __dma_supported(struct device *dev, u64 mask, bool warn) { - unsigned long max_dma_pfn; - - /* - * If the mask allows for more memory than we can address, - * and we actually have that much memory, then we must - * indicate that DMA to this device is not supported. 
- */ - if (sizeof(mask) != sizeof(dma_addr_t) && - mask > (dma_addr_t)~0 && - dma_to_pfn(dev, ~0) < max_pfn - 1) { - if (warn) { - dev_warn(dev, "Coherent DMA mask %#llx is larger than dma_addr_t allows\n", - mask); - dev_warn(dev, "Driver did not use or check the return value from dma_set_coherent_mask()?\n"); - } - return 0; - } - - max_dma_pfn = min(max_pfn, arm_dma_pfn_limit); + unsigned long max_dma_pfn = min(max_pfn, arm_dma_pfn_limit); /* * Translate the device's DMA mask to a PFN limit. This @@ -493,8 +475,7 @@ void __init dma_contiguous_remap(void) } } -static int __dma_update_pte(pte_t *pte, pgtable_t token, unsigned long addr, - void *data) +static int __dma_update_pte(pte_t *pte, unsigned long addr, void *data) { struct page *page = virt_to_page(addr); pgprot_t prot = *(pgprot_t *)data; diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 006d27ee4fc6..7d6291f23251 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -446,7 +446,7 @@ void ptdump_check_wx(void) static int ptdump_init(void) { ptdump_initialize(); - return ptdump_debugfs_register(&kernel_ptdump_info, - "kernel_page_tables"); + ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); + return 0; } __initcall(ptdump_init); diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0048eadd0681..0e417233dad7 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -53,17 +53,16 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) * This is useful to dump out the page tables associated with * 'addr' in mm 'mm'. */ -void show_pte(struct mm_struct *mm, unsigned long addr) +void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; if (!mm) mm = &init_mm; - pr_alert("pgd = %p\n", mm->pgd); + printk("%spgd = %p\n", lvl, mm->pgd); pgd = pgd_offset(mm, addr); - pr_alert("[%08lx] *pgd=%08llx", - addr, (long long)pgd_val(*pgd)); + printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd)); do { pud_t *pud; @@ -118,7 +117,7 @@ void show_pte(struct mm_struct *mm, unsigned long addr) pr_cont("\n"); } #else /* CONFIG_MMU */ -void show_pte(struct mm_struct *mm, unsigned long addr) +void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr) { } #endif /* CONFIG_MMU */ @@ -139,11 +138,12 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, * No handler, we'll have to terminate things with extreme prejudice. */ bust_spinlocks(1); + pr_alert("8<--- cut here ---\n"); pr_alert("Unable to handle kernel %s at virtual address %08lx\n", (addr < PAGE_SIZE) ? 
"NULL pointer dereference" : "paging request", addr); - show_pte(mm, addr); + show_pte(KERN_ALERT, mm, addr); die("Oops", regs, fsr); bust_spinlocks(0); do_exit(SIGKILL); @@ -154,19 +154,21 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, * User mode accesses just cause a SIGSEGV */ static void -__do_user_fault(struct task_struct *tsk, unsigned long addr, - unsigned int fsr, unsigned int sig, int code, - struct pt_regs *regs) +__do_user_fault(unsigned long addr, unsigned int fsr, unsigned int sig, + int code, struct pt_regs *regs) { + struct task_struct *tsk = current; + if (addr > TASK_SIZE) harden_branch_predictor(); #ifdef CONFIG_DEBUG_USER if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) || ((user_debug & UDBG_BUS) && (sig == SIGBUS))) { - printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n", + pr_err("8<--- cut here ---\n"); + pr_err("%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n", tsk->comm, sig, addr, fsr); - show_pte(tsk->mm, addr); + show_pte(KERN_ERR, tsk->mm, addr); show_regs(regs); } #endif @@ -180,7 +182,7 @@ __do_user_fault(struct task_struct *tsk, unsigned long addr, tsk->thread.address = addr; tsk->thread.error_code = fsr; tsk->thread.trap_no = 14; - force_sig_fault(sig, code, (void __user *)addr, tsk); + force_sig_fault(sig, code, (void __user *)addr); } void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) @@ -193,7 +195,7 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * have no context to handle this fault with. */ if (user_mode(regs)) - __do_user_fault(tsk, addr, fsr, SIGSEGV, SEGV_MAPERR, regs); + __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); else __do_kernel_fault(mm, addr, fsr, regs); } @@ -389,7 +391,7 @@ retry: SEGV_ACCERR : SEGV_MAPERR; } - __do_user_fault(tsk, addr, fsr, sig, code, regs); + __do_user_fault(addr, fsr, sig, code, regs); return 0; no_context: @@ -553,9 +555,10 @@ do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs)) return; + pr_alert("8<--- cut here ---\n"); pr_alert("Unhandled fault: %s (0x%03x) at 0x%08lx\n", inf->name, fsr, addr); - show_pte(current->mm, addr); + show_pte(KERN_ALERT, current->mm, addr); arm_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, fsr, 0); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 749a5a6f6143..4920a206dce9 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -239,6 +239,22 @@ static void __init arm_initrd_init(void) #endif } +#ifdef CONFIG_CPU_ICACHE_MISMATCH_WORKAROUND +void check_cpu_icache_size(int cpuid) +{ + u32 size, ctr; + + asm("mrc p15, 0, %0, c0, c0, 1" : "=r" (ctr)); + + size = 1 << ((ctr & 0xf) + 2); + if (cpuid != 0 && icache_size != size) + pr_info("CPU%u: detected I-Cache line size mismatch, workaround enabled\n", + cpuid); + if (icache_size > size) + icache_size = size; +} +#endif + void __init arm_memblock_init(const struct machine_desc *mdesc) { /* Register the kernel text, kernel data and initrd with memblock. 
*/ @@ -447,12 +463,6 @@ static void __init free_highpages(void) */ void __init mem_init(void) { -#ifdef CONFIG_HAVE_TCM - /* These pointers are filled in on TCM detection */ - extern u32 dtcm_end; - extern u32 itcm_end; -#endif - set_max_mapnr(pfn_to_page(max_pfn) - mem_map); /* this will put all unused low memory onto the freelists */ diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index 6b045c6653ea..941356d95a67 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -8,6 +8,8 @@ /* the upper-most page table pointer */ extern pmd_t *top_pmd; +extern int icache_size; + /* * 0xffff8000 to 0xffffffff is reserved for any ARM architecture * specific hacks for copying pages efficiently, while 0xffff4000 diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 1aa2586fa597..d9a0038774a6 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -729,7 +729,7 @@ static void __init *early_alloc(unsigned long sz) static void *__init late_alloc(unsigned long sz) { - void *ptr = (void *)__get_free_pages(PGALLOC_GFP, get_order(sz)); + void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) BUG(); diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index 0f5faf30d9bf..d546efad7e97 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -14,8 +14,7 @@ struct page_change_data { pgprot_t clear_mask; }; -static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; pte_t pte = *ptep; diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 83741c31757d..c4e8006a1a8c 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -389,6 +389,11 @@ __ca12_errata: orr r10, r10, #1 << 24 @ set bit #24 mcr p15, 0, r10, c15, c0, 1 @ write diagnostic register #endif +#ifdef CONFIG_ARM_ERRATA_857271 + mrc p15, 0, r10, c15, c0, 1 @ read diagnostic register + orr r10, r10, #3 << 10 @ set bits #10 and #11 + mcr p15, 0, r10, c15, c0, 1 @ write diagnostic register +#endif b __errata_finish __ca17_errata: @@ -404,6 +409,11 @@ __ca17_errata: orrle r10, r10, #1 << 12 @ set bit #12 mcrle p15, 0, r10, c15, c0, 1 @ write diagnostic register #endif +#ifdef CONFIG_ARM_ERRATA_857272 + mrc p15, 0, r10, c15, c0, 1 @ read diagnostic register + orr r10, r10, #3 << 10 @ set bits #10 and #11 + mcr p15, 0, r10, c15, c0, 1 @ write diagnostic register +#endif b __errata_finish __v7_pj4b_setup: diff --git a/arch/arm/mm/ptdump_debugfs.c b/arch/arm/mm/ptdump_debugfs.c index be8d87be4b93..598b636615a2 100644 --- a/arch/arm/mm/ptdump_debugfs.c +++ b/arch/arm/mm/ptdump_debugfs.c @@ -24,11 +24,7 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -int ptdump_debugfs_register(struct ptdump_info *info, const char *name) +void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { - struct dentry *pe; - - pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); - return pe ? 
0 : -ENOMEM; - + debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); } diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index adff54c312bf..97dc386e3cb8 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -733,7 +733,8 @@ static inline void emit_a32_alu_r64(const bool is64, const s8 dst[], /* ALU operation */ emit_alu_r(rd[1], rs, true, false, op, ctx); - emit_a32_mov_i(rd[0], 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(rd[0], 0, ctx); } arm_bpf_put_reg64(dst, rd, ctx); @@ -755,8 +756,9 @@ static inline void emit_a32_mov_r64(const bool is64, const s8 dst[], struct jit_ctx *ctx) { if (!is64) { emit_a32_mov_r(dst_lo, src_lo, ctx); - /* Zero out high 4 bytes */ - emit_a32_mov_i(dst_hi, 0, ctx); + if (!ctx->prog->aux->verifier_zext) + /* Zero out high 4 bytes */ + emit_a32_mov_i(dst_hi, 0, ctx); } else if (__LINUX_ARM_ARCH__ < 6 && ctx->cpu_architecture < CPU_ARCH_ARMv5TE) { /* complete 8 byte move */ @@ -1057,17 +1059,20 @@ static inline void emit_ldx_r(const s8 dst[], const s8 src, case BPF_B: /* Load a Byte */ emit(ARM_LDRB_I(rd[1], rm, off), ctx); - emit_a32_mov_i(rd[0], 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(rd[0], 0, ctx); break; case BPF_H: /* Load a HalfWord */ emit(ARM_LDRH_I(rd[1], rm, off), ctx); - emit_a32_mov_i(rd[0], 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(rd[0], 0, ctx); break; case BPF_W: /* Load a Word */ emit(ARM_LDR_I(rd[1], rm, off), ctx); - emit_a32_mov_i(rd[0], 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(rd[0], 0, ctx); break; case BPF_DW: /* Load a Double Word */ @@ -1356,6 +1361,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_ALU64 | BPF_MOV | BPF_X: switch (BPF_SRC(code)) { case BPF_X: + if (imm == 1) { + /* Special mov32 for zext */ + emit_a32_mov_i(dst_hi, 0, ctx); + break; + } emit_a32_mov_r64(is64, dst, src, ctx); break; case BPF_K: @@ -1435,7 +1445,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) } emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code)); arm_bpf_put_reg32(dst_lo, rd_lo, ctx); - emit_a32_mov_i(dst_hi, 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(dst_hi, 0, ctx); break; case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_X: @@ -1450,7 +1461,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) return -EINVAL; if (imm) emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code)); - emit_a32_mov_i(dst_hi, 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(dst_hi, 0, ctx); break; /* dst = dst << imm */ case BPF_ALU64 | BPF_LSH | BPF_K: @@ -1485,7 +1497,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* dst = ~dst */ case BPF_ALU | BPF_NEG: emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code)); - emit_a32_mov_i(dst_hi, 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_a32_mov_i(dst_hi, 0, ctx); break; /* dst = ~dst (64 bit) */ case BPF_ALU64 | BPF_NEG: @@ -1541,11 +1554,13 @@ emit_bswap_uxt: #else /* ARMv6+ */ emit(ARM_UXTH(rd[1], rd[1]), ctx); #endif - emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); + if (!ctx->prog->aux->verifier_zext) + emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); break; case 32: /* zero-extend 32 bits into 64 bits */ - emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); + if (!ctx->prog->aux->verifier_zext) + emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); break; case 64: /* nop */ @@ -1835,6 +1850,11 @@ void bpf_jit_compile(struct bpf_prog *prog) /* Nothing to do here. 
We support Internal BPF. */ } +bool bpf_jit_needs_zext(void) +{ + return true; +} + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_prog *tmp, *orig_prog = prog; diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index aaf479a9e92d..6da7dc4d79cc 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -447,3 +447,5 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 1f5ec9741e6d..ca85df247775 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -12,8 +12,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING ldflags-$(CONFIG_CPU_ENDIAN_BE8) := --be8 ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ - -z max-page-size=4096 -z common-page-size=4096 \ - -nostdlib -shared $(ldflags-y) \ + -z max-page-size=4096 -nostdlib -shared $(ldflags-y) \ $(call ld-option, --hash-style=sysv) \ $(call ld-option, --build-id) \ -T diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 697ea0510729..a36ff61321ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -26,6 +26,7 @@ config ARM64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX @@ -107,6 +108,8 @@ config ARM64 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select GENERIC_GETTIMEOFDAY + select GENERIC_COMPAT_VDSO if (!CPU_BIG_ENDIAN && COMPAT) select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_PCI @@ -140,6 +143,7 @@ config ARM64 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER @@ -160,6 +164,7 @@ config ARM64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_GENERIC_VDSO select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING @@ -260,10 +265,8 @@ config GENERIC_CALIBRATE_DELAY def_bool y config ZONE_DMA32 - def_bool y - -config HAVE_GENERIC_GUP - def_bool y + bool "Support DMA32 zone" if EXPERT + default y config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y @@ -933,7 +936,6 @@ config PARAVIRT config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" select PARAVIRT - default n help Select this option to enable fine granularity task steal time accounting. Time spent executing other tasks in parallel with @@ -994,7 +996,7 @@ config CRASH_DUMP reserved region and then later executed after a crash by kdump/kexec. - For more details see Documentation/kdump/kdump.txt + For more details see Documentation/kdump/kdump.rst config XEN_DOM0 def_bool y @@ -1418,12 +1420,27 @@ config ARM64_SVE KVM in the same kernel image. config ARM64_MODULE_PLTS - bool + bool "Use PLTs to allow module memory to spill over into vmalloc area" + depends on MODULES select HAVE_MOD_ARCH_SPECIFIC + help + Allocate PLTs when loading modules so that jumps and calls whose + targets are too far away for their relative offsets to be encoded + in the instructions themselves can be bounced via veneers in the + module's PLT. This allows modules to be allocated in the generic + vmalloc area after the dedicated module memory area has been + exhausted. 
+
+ When running with address space randomization (KASLR), the module
+ region itself may be too far away for ordinary relative jumps and
+ calls, and so in that case, module PLTs are required and cannot be
+ disabled.
+
+ Specific errata workaround(s) might also force module PLTs to be
+ enabled (ARM64_ERRATUM_843419).
 config ARM64_PSEUDO_NMI bool "Support for NMI-like interrupts"
- depends on BROKEN # 1556553607-46531-1-git-send-email-julien.thierry@arm.com
 select CONFIG_ARM_GIC_V3 help Adds support for mimicking Non-Maskable Interrupts through the use of
@@ -1436,6 +1453,17 @@ config ARM64_PSEUDO_NMI If unsure, say N
+if ARM64_PSEUDO_NMI
+config ARM64_DEBUG_PRIORITY_MASKING
+ bool "Debug interrupt priority masking"
+ help
+ This adds runtime checks to functions enabling/disabling
+ interrupts when using priority masking. The additional checks verify
+ the validity of ICC_PMR_EL1 when calling the functions concerned.
+
+ If unsure, say N
+endif
+
 config RELOCATABLE bool help
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index e9d2e578cbe6..e3d3fd0a4268 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -49,10 +49,26 @@ $(warning Detected assembler with broken .inst; disassembly will be unreliable) endif endif
-KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) $(brokengasinst)
+ifeq ($(CONFIG_GENERIC_COMPAT_VDSO), y)
+ CROSS_COMPILE_COMPAT ?= $(CONFIG_CROSS_COMPILE_COMPAT_VDSO:"%"=%)
+
+ ifeq ($(CONFIG_CC_IS_CLANG), y)
+ $(warning CROSS_COMPILE_COMPAT is clang, the compat vDSO will not be built)
+ else ifeq ($(CROSS_COMPILE_COMPAT),)
+ $(warning CROSS_COMPILE_COMPAT not defined or empty, the compat vDSO will not be built)
+ else ifeq ($(shell which $(CROSS_COMPILE_COMPAT)gcc 2> /dev/null),)
+ $(error $(CROSS_COMPILE_COMPAT)gcc not found, check CROSS_COMPILE_COMPAT)
+ else
+ export CROSS_COMPILE_COMPAT
+ export CONFIG_COMPAT_VDSO := y
+ compat_vdso := -DCONFIG_COMPAT_VDSO=1
+ endif
+endif
+
+KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) $(brokengasinst) $(compat_vdso)
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(call cc-disable-warning, psabi)
-KBUILD_AFLAGS += $(lseinstr) $(brokengasinst)
+KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) $(compat_vdso)
 KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) KBUILD_AFLAGS += $(call cc-option,-mabi=lp64)
@@ -164,6 +180,9 @@ ifeq ($(KBUILD_EXTMOD),) prepare: vdso_prepare vdso_prepare: prepare0 $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso include/generated/vdso-offsets.h
+ $(if $(CONFIG_COMPAT_VDSO),$(Q)$(MAKE) \
+ $(build)=arch/arm64/kernel/vdso32 \
+ include/generated/vdso32-offsets.h)
 endif define archhelp
diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index 470dcfd9de91..4b0f674df849 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -539,6 +539,14 @@ interrupts = <16 4>; };
+ ocram-ecc@ff8cc000 {
+ compatible = "altr,socfpga-s10-ocram-ecc",
+ "altr,socfpga-a10-ocram-ecc";
+ reg = <0xff8cc000 0x100>;
+ altr,ecc-parent = <&ocram>;
+ interrupts = <1 4>;
+ };
+
 usb0-ecc@ff8c4000 { compatible = "altr,socfpga-s10-usb-ecc", "altr,socfpga-usb-ecc";
diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts index 84f9f5902e74..66e4ffb4e929 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dts @@ -56,6 +56,17 @@ clock-frequency = <25000000>; };
}; + + eccmgr { + sdmmca-ecc@ff8c8c00 { + compatible = "altr,socfpga-s10-sdmmc-ecc", + "altr,socfpga-sdmmc-ecc"; + reg = <0xff8c8c00 0x100>; + altr,ecc-parent = <&mmc>; + interrupts = <14 4>, + <15 4>; + }; + }; }; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index bf7f845447ed..22a1c74dddf3 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -431,6 +431,12 @@ compatible = "fsl,enetc"; reg = <0x000100 0 0 0 0>; }; + ethernet@0,4 { + compatible = "fsl,enetc-ptp"; + reg = <0x000400 0 0 0 0>; + clocks = <&clockgen 4 0>; + little-endian; + }; }; }; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index 661137ffa319..dacd8cf03a7f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -609,6 +609,14 @@ <GIC_SPI 209 IRQ_TYPE_LEVEL_HIGH>; }; + ptp-timer@8b95000 { + compatible = "fsl,dpaa2-ptp"; + reg = <0x0 0x8b95000 0x0 0x100>; + clocks = <&clockgen 4 0>; + little-endian; + fsl,extts-fifo; + }; + cluster1_core0_watchdog: wdt@c000000 { compatible = "arm,sp805-wdt", "arm,primecell"; reg = <0x0 0xc000000 0x0 0x1000>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index d7e78dcd153d..3ace91945b72 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -321,6 +321,14 @@ }; }; + ptp-timer@8b95000 { + compatible = "fsl,dpaa2-ptp"; + reg = <0x0 0x8b95000 0x0 0x100>; + clocks = <&clockgen 4 1>; + little-endian; + fsl,extts-fifo; + }; + fsl_mc: fsl-mc@80c000000 { compatible = "fsl,qoriq-mc"; reg = <0x00000008 0x0c000000 0 0x40>, /* MC portal base */ diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi index 125a8cc2c5b3..e6fdba39453c 100644 --- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi @@ -848,6 +848,14 @@ dma-coherent; }; + ptp-timer@8b95000 { + compatible = "fsl,dpaa2-ptp"; + reg = <0x0 0x8b95000 0x0 0x100>; + clocks = <&clockgen 4 1>; + little-endian; + fsl,extts-fifo; + }; + fsl_mc: fsl-mc@80c000000 { compatible = "fsl,qoriq-mc"; reg = <0x00000008 0x0c000000 0 0x40>, diff --git a/arch/arm64/boot/dts/mediatek/mt7622.dtsi b/arch/arm64/boot/dts/mediatek/mt7622.dtsi index 4b1f5ae710eb..d1e13d340e26 100644 --- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi @@ -929,7 +929,8 @@ sgmiisys: sgmiisys@1b128000 { compatible = "mediatek,mt7622-sgmiisys", "syscon"; - reg = <0 0x1b128000 0 0x1000>; + reg = <0 0x1b128000 0 0x3000>; #clock-cells = <1>; + mediatek,physpeed = "2500"; }; }; diff --git a/arch/arm64/boot/dts/qcom/msm8998-mtp.dtsi b/arch/arm64/boot/dts/qcom/msm8998-mtp.dtsi index f09f3e03f708..108667ce4f31 100644 --- a/arch/arm64/boot/dts/qcom/msm8998-mtp.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8998-mtp.dtsi @@ -27,6 +27,23 @@ status = "okay"; }; +&pm8005_lsid1 { + pm8005-regulators { + compatible = "qcom,pm8005-regulators"; + + vdd_s1-supply = <&vph_pwr>; + + pm8005_s1: s1 { /* VDD_GFX supply */ + regulator-min-microvolt = <524000>; + regulator-max-microvolt = <1100000>; + regulator-enable-ramp-delay = <500>; + + /* hack until we rig up the gpu consumer */ + regulator-always-on; + }; + }; +}; + &qusb2phy { status = "okay"; diff --git a/arch/arm64/configs/defconfig 
b/arch/arm64/configs/defconfig index 6bca5b082ea4..dd827e64e5fe 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -68,6 +68,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_XEN=y CONFIG_COMPAT=y +CONFIG_RANDOMIZE_BASE=y CONFIG_HIBERNATION=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y CONFIG_ARM_CPUIDLE=y diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index 3ebfaec97e27..00bd2885feaa 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -15,6 +15,8 @@ .arch armv8-a+crypto xtsmask .req v16 + cbciv .req v16 + vctr .req v16 .macro xts_reload_mask, tmp .endm @@ -49,7 +51,7 @@ load_round_keys \rounds, \temp .endm - .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 + .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 aes\de \i0\().16b, \k\().16b aes\mc \i0\().16b, \i0\().16b .ifnb \i1 @@ -60,27 +62,34 @@ aes\mc \i2\().16b, \i2\().16b aes\de \i3\().16b, \k\().16b aes\mc \i3\().16b, \i3\().16b + .ifnb \i4 + aes\de \i4\().16b, \k\().16b + aes\mc \i4\().16b, \i4\().16b + .endif .endif .endif .endm - /* up to 4 interleaved encryption rounds with the same round key */ - .macro round_Nx, enc, k, i0, i1, i2, i3 + /* up to 5 interleaved encryption rounds with the same round key */ + .macro round_Nx, enc, k, i0, i1, i2, i3, i4 .ifc \enc, e - do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 + do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4 .else - do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 + do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4 .endif .endm - /* up to 4 interleaved final rounds */ - .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 + /* up to 5 interleaved final rounds */ + .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4 aes\de \i0\().16b, \k\().16b .ifnb \i1 aes\de \i1\().16b, \k\().16b .ifnb \i3 aes\de \i2\().16b, \k\().16b aes\de \i3\().16b, \k\().16b + .ifnb \i4 + aes\de \i4\().16b, \k\().16b + .endif .endif .endif eor \i0\().16b, \i0\().16b, \k2\().16b @@ -89,47 +98,52 @@ .ifnb \i3 eor \i2\().16b, \i2\().16b, \k2\().16b eor \i3\().16b, \i3\().16b, \k2\().16b + .ifnb \i4 + eor \i4\().16b, \i4\().16b, \k2\().16b + .endif .endif .endif .endm - /* up to 4 interleaved blocks */ - .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 + /* up to 5 interleaved blocks */ + .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 cmp \rounds, #12 blo 2222f /* 128 bits */ beq 1111f /* 192 bits */ - round_Nx \enc, v17, \i0, \i1, \i2, \i3 - round_Nx \enc, v18, \i0, \i1, \i2, \i3 -1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 - round_Nx \enc, v20, \i0, \i1, \i2, \i3 + round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 + round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 +1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 + round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 - round_Nx \enc, \key, \i0, \i1, \i2, \i3 + round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 .endr - fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 + fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4 .endm .macro encrypt_block, in, rounds, t0, t1, t2 do_block_Nx e, \rounds, \in .endm - .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 - do_block_Nx e, \rounds, \i0, \i1 - .endm - .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 .endm - .macro decrypt_block, in, rounds, t0, t1, t2 - do_block_Nx d, \rounds, \in + .macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4 .endm - .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 - do_block_Nx d, \rounds, \i0, \i1 + 
.macro decrypt_block, in, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \in .endm .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 .endm + .macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4 + .endm + +#define MAX_STRIDE 5 + #include "aes-modes.S" diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 2883def14be5..324039b72094 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -10,6 +10,18 @@ .text .align 4 +#ifndef MAX_STRIDE +#define MAX_STRIDE 4 +#endif + +#if MAX_STRIDE == 4 +#define ST4(x...) x +#define ST5(x...) +#else +#define ST4(x...) +#define ST5(x...) x +#endif + aes_encrypt_block4x: encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 ret @@ -20,6 +32,18 @@ aes_decrypt_block4x: ret ENDPROC(aes_decrypt_block4x) +#if MAX_STRIDE == 5 +aes_encrypt_block5x: + encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 + ret +ENDPROC(aes_encrypt_block5x) + +aes_decrypt_block5x: + decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 + ret +ENDPROC(aes_decrypt_block5x) +#endif + /* * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, * int blocks) @@ -34,14 +58,17 @@ AES_ENTRY(aes_ecb_encrypt) enc_prepare w3, x2, x5 .LecbencloopNx: - subs w4, w4, #4 + subs w4, w4, #MAX_STRIDE bmi .Lecbenc1x ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ - bl aes_encrypt_block4x +ST4( bl aes_encrypt_block4x ) +ST5( ld1 {v4.16b}, [x1], #16 ) +ST5( bl aes_encrypt_block5x ) st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) b .LecbencloopNx .Lecbenc1x: - adds w4, w4, #4 + adds w4, w4, #MAX_STRIDE beq .Lecbencout .Lecbencloop: ld1 {v0.16b}, [x1], #16 /* get next pt block */ @@ -62,14 +89,17 @@ AES_ENTRY(aes_ecb_decrypt) dec_prepare w3, x2, x5 .LecbdecloopNx: - subs w4, w4, #4 + subs w4, w4, #MAX_STRIDE bmi .Lecbdec1x ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ - bl aes_decrypt_block4x +ST4( bl aes_decrypt_block4x ) +ST5( ld1 {v4.16b}, [x1], #16 ) +ST5( bl aes_decrypt_block5x ) st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) b .LecbdecloopNx .Lecbdec1x: - adds w4, w4, #4 + adds w4, w4, #MAX_STRIDE beq .Lecbdecout .Lecbdecloop: ld1 {v0.16b}, [x1], #16 /* get next ct block */ @@ -129,39 +159,56 @@ AES_ENTRY(aes_cbc_decrypt) stp x29, x30, [sp, #-16]! 
mov x29, sp - ld1 {v7.16b}, [x5] /* get iv */ + ld1 {cbciv.16b}, [x5] /* get iv */ dec_prepare w3, x2, x6 .LcbcdecloopNx: - subs w4, w4, #4 + subs w4, w4, #MAX_STRIDE bmi .Lcbcdec1x ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ +#if MAX_STRIDE == 5 + ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ + mov v5.16b, v0.16b + mov v6.16b, v1.16b + mov v7.16b, v2.16b + bl aes_decrypt_block5x + sub x1, x1, #32 + eor v0.16b, v0.16b, cbciv.16b + eor v1.16b, v1.16b, v5.16b + ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v4.16b, v4.16b, v5.16b +#else mov v4.16b, v0.16b mov v5.16b, v1.16b mov v6.16b, v2.16b bl aes_decrypt_block4x sub x1, x1, #16 - eor v0.16b, v0.16b, v7.16b + eor v0.16b, v0.16b, cbciv.16b eor v1.16b, v1.16b, v4.16b - ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ eor v2.16b, v2.16b, v5.16b eor v3.16b, v3.16b, v6.16b +#endif st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) b .LcbcdecloopNx .Lcbcdec1x: - adds w4, w4, #4 + adds w4, w4, #MAX_STRIDE beq .Lcbcdecout .Lcbcdecloop: ld1 {v1.16b}, [x1], #16 /* get next ct block */ mov v0.16b, v1.16b /* ...and copy to v0 */ decrypt_block v0, w3, x2, x6, w7 - eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ - mov v7.16b, v1.16b /* ct is next iv */ + eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ + mov cbciv.16b, v1.16b /* ct is next iv */ st1 {v0.16b}, [x0], #16 subs w4, w4, #1 bne .Lcbcdecloop .Lcbcdecout: - st1 {v7.16b}, [x5] /* return iv */ + st1 {cbciv.16b}, [x5] /* return iv */ ldp x29, x30, [sp], #16 ret AES_ENDPROC(aes_cbc_decrypt) @@ -255,51 +302,60 @@ AES_ENTRY(aes_ctr_encrypt) mov x29, sp enc_prepare w3, x2, x6 - ld1 {v4.16b}, [x5] + ld1 {vctr.16b}, [x5] - umov x6, v4.d[1] /* keep swabbed ctr in reg */ + umov x6, vctr.d[1] /* keep swabbed ctr in reg */ rev x6, x6 cmn w6, w4 /* 32 bit overflow? */ bcs .Lctrloop .LctrloopNx: - subs w4, w4, #4 + subs w4, w4, #MAX_STRIDE bmi .Lctr1x add w7, w6, #1 - mov v0.16b, v4.16b + mov v0.16b, vctr.16b add w8, w6, #2 - mov v1.16b, v4.16b + mov v1.16b, vctr.16b + add w9, w6, #3 + mov v2.16b, vctr.16b add w9, w6, #3 - mov v2.16b, v4.16b rev w7, w7 - mov v3.16b, v4.16b + mov v3.16b, vctr.16b rev w8, w8 +ST5( mov v4.16b, vctr.16b ) mov v1.s[3], w7 rev w9, w9 +ST5( add w10, w6, #4 ) mov v2.s[3], w8 +ST5( rev w10, w10 ) mov v3.s[3], w9 +ST5( mov v4.s[3], w10 ) ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ - bl aes_encrypt_block4x +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b - ld1 {v5.16b}, [x1], #16 /* get 1 input block */ +ST4( ld1 {v5.16b}, [x1], #16 ) eor v1.16b, v6.16b, v1.16b +ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) eor v2.16b, v7.16b, v2.16b eor v3.16b, v5.16b, v3.16b +ST5( eor v4.16b, v6.16b, v4.16b ) st1 {v0.16b-v3.16b}, [x0], #64 - add x6, x6, #4 +ST5( st1 {v4.16b}, [x0], #16 ) + add x6, x6, #MAX_STRIDE rev x7, x6 - ins v4.d[1], x7 + ins vctr.d[1], x7 cbz w4, .Lctrout b .LctrloopNx .Lctr1x: - adds w4, w4, #4 + adds w4, w4, #MAX_STRIDE beq .Lctrout .Lctrloop: - mov v0.16b, v4.16b + mov v0.16b, vctr.16b encrypt_block v0, w3, x2, x8, w7 adds x6, x6, #1 /* increment BE ctr */ rev x7, x6 - ins v4.d[1], x7 + ins vctr.d[1], x7 bcs .Lctrcarry /* overflow? 
*/ .Lctrcarrydone: @@ -311,7 +367,7 @@ AES_ENTRY(aes_ctr_encrypt) bne .Lctrloop .Lctrout: - st1 {v4.16b}, [x5] /* return next CTR value */ + st1 {vctr.16b}, [x5] /* return next CTR value */ ldp x29, x30, [sp], #16 ret @@ -320,11 +376,11 @@ AES_ENTRY(aes_ctr_encrypt) b .Lctrout .Lctrcarry: - umov x7, v4.d[0] /* load upper word of ctr */ + umov x7, vctr.d[0] /* load upper word of ctr */ rev x7, x7 /* ... to handle the carry */ add x7, x7, #1 rev x7, x7 - ins v4.d[0], x7 + ins vctr.d[0], x7 b .Lctrcarrydone AES_ENDPROC(aes_ctr_encrypt) diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index d261331747f2..2bebccc73869 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -12,6 +12,8 @@ #define AES_ENDPROC(func) ENDPROC(neon_ ## func) xtsmask .req v7 + cbciv .req v7 + vctr .req v4 .macro xts_reload_mask, tmp xts_load_mask \tmp @@ -114,26 +116,9 @@ /* * Interleaved versions: functionally equivalent to the - * ones above, but applied to 2 or 4 AES states in parallel. + * ones above, but applied to AES states in parallel. */ - .macro sub_bytes_2x, in0, in1 - sub v8.16b, \in0\().16b, v15.16b - tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b - sub v9.16b, \in1\().16b, v15.16b - tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b - sub v10.16b, v8.16b, v15.16b - tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b - sub v11.16b, v9.16b, v15.16b - tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b - sub v8.16b, v10.16b, v15.16b - tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b - sub v9.16b, v11.16b, v15.16b - tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b - tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b - tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b - .endm - .macro sub_bytes_4x, in0, in1, in2, in3 sub v8.16b, \in0\().16b, v15.16b tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b @@ -212,25 +197,6 @@ eor \in1\().16b, \in1\().16b, v11.16b .endm - .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i - ld1 {v15.4s}, [\rk] - add \rkp, \rk, #16 - mov \i, \rounds -1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ - eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ - movi v15.16b, #0x40 - tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ - tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ - sub_bytes_2x \in0, \in1 - subs \i, \i, #1 - ld1 {v15.4s}, [\rkp], #16 - beq 2222f - mix_columns_2x \in0, \in1, \enc - b 1111b -2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ - eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ - .endm - .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 @@ -257,14 +223,6 @@ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ .endm - .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i - do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i - .endm - - .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i - do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i - .endm - .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i .endm diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 82029cda2e77..1495d2b18518 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -60,7 +60,7 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, } static int chacha_neon_stream_xor(struct skcipher_request *req, - struct chacha_ctx *ctx, u8 *iv) + const struct chacha_ctx *ctx, const u8 *iv) { struct skcipher_walk walk; u32 state[16]; diff --git 
a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index ecb0f67e5998..bdc1b6d7aff7 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -52,7 +52,7 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); + bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE) && len; if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 955c3c2d3f5a..604a01a4ede6 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -57,7 +57,7 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); - bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); + bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE) && len; if (!crypto_simd_usable()) { if (len) diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index ada0bc480a1b..b263e239cb59 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -38,6 +38,9 @@ (!(entry) || (entry)->header.length < ACPI_MADT_GICC_MIN_LENGTH || \ (unsigned long)(entry) + (entry)->header.length > (end)) +#define ACPI_MADT_GICC_SPE (ACPI_OFFSET(struct acpi_madt_generic_interrupt, \ + spe_interrupt) + sizeof(u16)) + /* Basic configuration for ACPI */ #ifdef CONFIG_ACPI pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 2247908e55d6..79155a8cfe7c 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -152,7 +152,9 @@ static inline bool gic_prio_masking_enabled(void) static inline void gic_pmr_mask_irqs(void) { - BUILD_BUG_ON(GICD_INT_DEF_PRI <= GIC_PRIO_IRQOFF); + BUILD_BUG_ON(GICD_INT_DEF_PRI < (GIC_PRIO_IRQOFF | + GIC_PRIO_PSR_I_SET)); + BUILD_BUG_ON(GICD_INT_DEF_PRI >= GIC_PRIO_IRQON); gic_write_pmr(GIC_PRIO_IRQOFF); } diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index 6756178c27db..7ae54d7d333a 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -9,6 +9,7 @@ #define __ASM_ARCH_TIMER_H #include <asm/barrier.h> +#include <asm/hwcap.h> #include <asm/sysreg.h> #include <linux/bug.h> @@ -229,4 +230,16 @@ static inline int arch_timer_arch_init(void) return 0; } +static inline void arch_timer_set_evtstrm_feature(void) +{ + cpu_set_named_feature(EVTSTRM); +#ifdef CONFIG_COMPAT + compat_elf_hwcap |= COMPAT_HWCAP_EVTSTRM; +#endif +} + +static inline bool arch_timer_have_evtstrm_feature(void) +{ + return cpu_have_named_feature(EVTSTRM); +} #endif diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 570d195a184d..e3a15c751b13 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -96,7 +96,11 @@ * RAS Error Synchronization barrier */ .macro esb +#ifdef CONFIG_ARM64_RAS_EXTN hint #16 +#else + nop +#endif .endm /* diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h index 23c378606aed..c8c850bc3dfb 100644 --- a/arch/arm64/include/asm/atomic_ll_sc.h +++ b/arch/arm64/include/asm/atomic_ll_sc.h @@ -122,9 +122,9 @@ ATOMIC_OPS(xor, eor) #define ATOMIC64_OP(op, asm_op) \ 
__LL_SC_INLINE void \ -__LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v)) \ +__LL_SC_PREFIX(arch_atomic64_##op(s64 i, atomic64_t *v)) \ { \ - long result; \ + s64 result; \ unsigned long tmp; \ \ asm volatile("// atomic64_" #op "\n" \ @@ -139,10 +139,10 @@ __LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v)) \ __LL_SC_EXPORT(arch_atomic64_##op); #define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op) \ -__LL_SC_INLINE long \ -__LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\ +__LL_SC_INLINE s64 \ +__LL_SC_PREFIX(arch_atomic64_##op##_return##name(s64 i, atomic64_t *v))\ { \ - long result; \ + s64 result; \ unsigned long tmp; \ \ asm volatile("// atomic64_" #op "_return" #name "\n" \ @@ -161,10 +161,10 @@ __LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\ __LL_SC_EXPORT(arch_atomic64_##op##_return##name); #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op) \ -__LL_SC_INLINE long \ -__LL_SC_PREFIX(arch_atomic64_fetch_##op##name(long i, atomic64_t *v)) \ +__LL_SC_INLINE s64 \ +__LL_SC_PREFIX(arch_atomic64_fetch_##op##name(s64 i, atomic64_t *v)) \ { \ - long result, val; \ + s64 result, val; \ unsigned long tmp; \ \ asm volatile("// atomic64_fetch_" #op #name "\n" \ @@ -214,10 +214,10 @@ ATOMIC64_OPS(xor, eor) #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP -__LL_SC_INLINE long +__LL_SC_INLINE s64 __LL_SC_PREFIX(arch_atomic64_dec_if_positive(atomic64_t *v)) { - long result; + s64 result; unsigned long tmp; asm volatile("// atomic64_dec_if_positive\n" diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 45e030d54332..69acb1c19a15 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -213,9 +213,9 @@ ATOMIC_FETCH_OP_SUB( , al, "memory") #define __LL_SC_ATOMIC64(op) __LL_SC_CALL(arch_atomic64_##op) #define ATOMIC64_OP(op, asm_op) \ -static inline void arch_atomic64_##op(long i, atomic64_t *v) \ +static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ { \ - register long x0 asm ("x0") = i; \ + register s64 x0 asm ("x0") = i; \ register atomic64_t *x1 asm ("x1") = v; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN(__LL_SC_ATOMIC64(op), \ @@ -233,9 +233,9 @@ ATOMIC64_OP(add, stadd) #undef ATOMIC64_OP #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...) \ -static inline long arch_atomic64_fetch_##op##name(long i, atomic64_t *v)\ +static inline s64 arch_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \ { \ - register long x0 asm ("x0") = i; \ + register s64 x0 asm ("x0") = i; \ register atomic64_t *x1 asm ("x1") = v; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ @@ -265,9 +265,9 @@ ATOMIC64_FETCH_OPS(add, ldadd) #undef ATOMIC64_FETCH_OPS #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...) 
\ -static inline long arch_atomic64_add_return##name(long i, atomic64_t *v)\ +static inline s64 arch_atomic64_add_return##name(s64 i, atomic64_t *v) \ { \ - register long x0 asm ("x0") = i; \ + register s64 x0 asm ("x0") = i; \ register atomic64_t *x1 asm ("x1") = v; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ @@ -291,9 +291,9 @@ ATOMIC64_OP_ADD_RETURN( , al, "memory") #undef ATOMIC64_OP_ADD_RETURN -static inline void arch_atomic64_and(long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { - register long x0 asm ("x0") = i; + register s64 x0 asm ("x0") = i; register atomic64_t *x1 asm ("x1") = v; asm volatile(ARM64_LSE_ATOMIC_INSN( @@ -309,9 +309,9 @@ static inline void arch_atomic64_and(long i, atomic64_t *v) } #define ATOMIC64_FETCH_OP_AND(name, mb, cl...) \ -static inline long arch_atomic64_fetch_and##name(long i, atomic64_t *v) \ +static inline s64 arch_atomic64_fetch_and##name(s64 i, atomic64_t *v) \ { \ - register long x0 asm ("x0") = i; \ + register s64 x0 asm ("x0") = i; \ register atomic64_t *x1 asm ("x1") = v; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ @@ -335,9 +335,9 @@ ATOMIC64_FETCH_OP_AND( , al, "memory") #undef ATOMIC64_FETCH_OP_AND -static inline void arch_atomic64_sub(long i, atomic64_t *v) +static inline void arch_atomic64_sub(s64 i, atomic64_t *v) { - register long x0 asm ("x0") = i; + register s64 x0 asm ("x0") = i; register atomic64_t *x1 asm ("x1") = v; asm volatile(ARM64_LSE_ATOMIC_INSN( @@ -353,9 +353,9 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) } #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...) \ -static inline long arch_atomic64_sub_return##name(long i, atomic64_t *v)\ +static inline s64 arch_atomic64_sub_return##name(s64 i, atomic64_t *v) \ { \ - register long x0 asm ("x0") = i; \ + register s64 x0 asm ("x0") = i; \ register atomic64_t *x1 asm ("x1") = v; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ @@ -381,9 +381,9 @@ ATOMIC64_OP_SUB_RETURN( , al, "memory") #undef ATOMIC64_OP_SUB_RETURN #define ATOMIC64_FETCH_OP_SUB(name, mb, cl...) \ -static inline long arch_atomic64_fetch_sub##name(long i, atomic64_t *v) \ +static inline s64 arch_atomic64_fetch_sub##name(s64 i, atomic64_t *v) \ { \ - register long x0 asm ("x0") = i; \ + register s64 x0 asm ("x0") = i; \ register atomic64_t *x1 asm ("x1") = v; \ \ asm volatile(ARM64_LSE_ATOMIC_INSN( \ @@ -407,7 +407,7 @@ ATOMIC64_FETCH_OP_SUB( , al, "memory") #undef ATOMIC64_FETCH_OP_SUB -static inline long arch_atomic64_dec_if_positive(atomic64_t *v) +static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { register long x0 asm ("x0") = (long)v; diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index a05db636981a..64eeaa41e7ca 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -80,12 +80,15 @@ static inline u32 cache_type_cwg(void) #define __read_mostly __attribute__((__section__(".data..read_mostly"))) -static inline int cache_line_size(void) +static inline int cache_line_size_of_cpu(void) { u32 cwg = cache_type_cwg(); + return cwg ? 4 << cwg : ARCH_DMA_MINALIGN; } +int cache_line_size(void); + /* * Read the effective value of CTR_EL0. 
* diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 1fe4467442aa..665c78e0665a 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -176,4 +176,7 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) int set_memory_valid(unsigned long addr, int numpages, int enable); +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + #endif diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 373799b7982f..407e2bf23676 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -614,6 +614,18 @@ static inline bool system_uses_irq_prio_masking(void) cpus_have_const_cap(ARM64_HAS_IRQ_PRIO_MASKING); } +static inline bool system_has_prio_mask_debugging(void) +{ + return IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) && + system_uses_irq_prio_masking(); +} + +#define ARM64_BP_HARDEN_UNKNOWN -1 +#define ARM64_BP_HARDEN_WA_NEEDED 0 +#define ARM64_BP_HARDEN_NOT_REQUIRED 1 + +int get_spectre_v2_workaround_state(void); + #define ARM64_SSBD_UNKNOWN -1 #define ARM64_SSBD_FORCE_DISABLE 0 #define ARM64_SSBD_KERNEL 1 diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index 6dd8a8723525..987926ed535e 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -7,6 +7,7 @@ #include <linux/irqflags.h> +#include <asm/arch_gicv3.h> #include <asm/cpufeature.h> #define DAIF_PROCCTX 0 @@ -16,11 +17,20 @@ /* mask/save/unmask/restore all exceptions, including interrupts. */ static inline void local_daif_mask(void) { + WARN_ON(system_has_prio_mask_debugging() && + (read_sysreg_s(SYS_ICC_PMR_EL1) == (GIC_PRIO_IRQOFF | + GIC_PRIO_PSR_I_SET))); + asm volatile( "msr daifset, #0xf // local_daif_mask\n" : : : "memory"); + + /* Don't really care for a dsb here, we don't intend to enable IRQs */ + if (system_uses_irq_prio_masking()) + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + trace_hardirqs_off(); } @@ -32,7 +42,7 @@ static inline unsigned long local_daif_save(void) if (system_uses_irq_prio_masking()) { /* If IRQs are masked with PMR, reflect it in the flags */ - if (read_sysreg_s(SYS_ICC_PMR_EL1) <= GIC_PRIO_IRQOFF) + if (read_sysreg_s(SYS_ICC_PMR_EL1) != GIC_PRIO_IRQON) flags |= PSR_I_BIT; } @@ -45,39 +55,50 @@ static inline void local_daif_restore(unsigned long flags) { bool irq_disabled = flags & PSR_I_BIT; + WARN_ON(system_has_prio_mask_debugging() && + !(read_sysreg(daif) & PSR_I_BIT)); + if (!irq_disabled) { trace_hardirqs_on(); - if (system_uses_irq_prio_masking()) - arch_local_irq_enable(); - } else if (!(flags & PSR_A_BIT)) { - /* - * If interrupts are disabled but we can take - * asynchronous errors, we can take NMIs - */ if (system_uses_irq_prio_masking()) { - flags &= ~PSR_I_BIT; + gic_write_pmr(GIC_PRIO_IRQON); + dsb(sy); + } + } else if (system_uses_irq_prio_masking()) { + u64 pmr; + + if (!(flags & PSR_A_BIT)) { /* - * There has been concern that the write to daif - * might be reordered before this write to PMR. - * From the ARM ARM DDI 0487D.a, section D1.7.1 - * "Accessing PSTATE fields": - * Writes to the PSTATE fields have side-effects on - * various aspects of the PE operation. All of these - * side-effects are guaranteed: - * - Not to be visible to earlier instructions in - * the execution stream. 
- * - To be visible to later instructions in the
- * execution stream
- *
- * Also, writes to PMR are self-synchronizing, so no
- * interrupts with a lower priority than PMR is signaled
- * to the PE after the write.
- *
- * So we don't need additional synchronization here.
+ * If interrupts are disabled but we can take
+ * asynchronous errors, we can take NMIs
 */
- arch_local_irq_disable();
+ flags &= ~PSR_I_BIT;
+ pmr = GIC_PRIO_IRQOFF;
+ } else {
+ pmr = GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET;
 }
+
+ /*
+ * There has been concern that the write to daif
+ * might be reordered before this write to PMR.
+ * From the ARM ARM DDI 0487D.a, section D1.7.1
+ * "Accessing PSTATE fields":
+ * Writes to the PSTATE fields have side-effects on
+ * various aspects of the PE operation. All of these
+ * side-effects are guaranteed:
+ * - Not to be visible to earlier instructions in
+ * the execution stream.
+ * - To be visible to later instructions in the
+ * execution stream
+ *
+ * Also, writes to PMR are self-synchronizing, so no
+ * interrupts with a lower priority than PMR are signaled
+ * to the PE after the write.
+ *
+ * So we don't need additional synchronization here.
+ */
+ gic_write_pmr(pmr);
 }
 write_sysreg(flags, daif);
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index c9e9a6978e73..8e79ce9c3f5c 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -83,7 +83,7 @@ static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) * guaranteed to cover the kernel Image. * * Since the EFI stub is part of the kernel Image, we can relax the
- * usual requirements in Documentation/arm64/booting.txt, which still
+ * usual requirements in Documentation/arm64/booting.rst, which still
 * apply to other bootloaders, and are required for some kernel * configurations. */
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 325d9515c0f8..3c7037c6ba9b 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -202,7 +202,21 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; ({ \ set_thread_flag(TIF_32BIT); \ })
+#ifdef CONFIG_GENERIC_COMPAT_VDSO
+#define COMPAT_ARCH_DLINFO \
+do { \
+ /* \
+ * Note that we use Elf64_Off instead of elf_addr_t because \
+ * elf_addr_t in compat is defined as Elf32_Addr and casting \
+ * current->mm->context.vdso to it triggers a warning about a \
+ * cast from pointer to integer of different size.
\ + */ \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (Elf64_Off)current->mm->context.vdso); \ +} while (0) +#else #define COMPAT_ARCH_DLINFO +#endif extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages \ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 897029c8e9b5..b6a2c352f4c3 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -37,8 +37,6 @@ struct task_struct; extern void fpsimd_save_state(struct user_fpsimd_state *state); extern void fpsimd_load_state(struct user_fpsimd_state *state); -extern void fpsimd_save(void); - extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); @@ -52,8 +50,7 @@ extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state, void *sve_state, unsigned int sve_vl); extern void fpsimd_flush_task_state(struct task_struct *target); -extern void fpsimd_flush_cpu_state(void); -extern void sve_flush_cpu_state(void); +extern void fpsimd_save_and_flush_cpu_state(void); /* Maximum VL that SVE VL-agnostic software can transparently support */ #define SVE_VL_ARCH_MAX 0x100 diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index e5d9420cd258..3d2f2472a36c 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -84,6 +84,8 @@ #define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) #define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) #define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) +#define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) +#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/image.h b/arch/arm64/include/asm/image.h index e2c27a2278e9..c2b13213c720 100644 --- a/arch/arm64/include/asm/image.h +++ b/arch/arm64/include/asm/image.h @@ -27,7 +27,7 @@ /* * struct arm64_image_header - arm64 kernel image header - * See Documentation/arm64/booting.txt for details + * See Documentation/arm64/booting.rst for details * * @code0: Executable code, or * @mz_header alternatively used for part of MZ header diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 66853fde60f9..7872f260c9ee 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -29,6 +29,12 @@ */ static inline void arch_local_irq_enable(void) { + if (system_has_prio_mask_debugging()) { + u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ON_ONCE(pmr != GIC_PRIO_IRQON && pmr != GIC_PRIO_IRQOFF); + } + asm volatile(ALTERNATIVE( "msr daifclr, #2 // arch_local_irq_enable\n" "nop", @@ -42,6 +48,12 @@ static inline void arch_local_irq_enable(void) static inline void arch_local_irq_disable(void) { + if (system_has_prio_mask_debugging()) { + u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ON_ONCE(pmr != GIC_PRIO_IRQON && pmr != GIC_PRIO_IRQOFF); + } + asm volatile(ALTERNATIVE( "msr daifset, #2 // arch_local_irq_disable", __msr_s(SYS_ICC_PMR_EL1, "%0"), @@ -56,43 +68,46 @@ static inline void arch_local_irq_disable(void) */ static inline unsigned long arch_local_save_flags(void) { - unsigned long daif_bits; unsigned long flags; - daif_bits = read_sysreg(daif); - - /* - * The asm is logically equivalent to: - * - * if (system_uses_irq_prio_masking()) - * flags = (daif_bits & PSR_I_BIT) ? 
- * GIC_PRIO_IRQOFF : - * read_sysreg_s(SYS_ICC_PMR_EL1); - * else - * flags = daif_bits; - */ asm volatile(ALTERNATIVE( - "mov %0, %1\n" - "nop\n" - "nop", - __mrs_s("%0", SYS_ICC_PMR_EL1) - "ands %1, %1, " __stringify(PSR_I_BIT) "\n" - "csel %0, %0, %2, eq", - ARM64_HAS_IRQ_PRIO_MASKING) - : "=&r" (flags), "+r" (daif_bits) - : "r" ((unsigned long) GIC_PRIO_IRQOFF) + "mrs %0, daif", + __mrs_s("%0", SYS_ICC_PMR_EL1), + ARM64_HAS_IRQ_PRIO_MASKING) + : "=&r" (flags) + : : "memory"); return flags; } +static inline int arch_irqs_disabled_flags(unsigned long flags) +{ + int res; + + asm volatile(ALTERNATIVE( + "and %w0, %w1, #" __stringify(PSR_I_BIT), + "eor %w0, %w1, #" __stringify(GIC_PRIO_IRQON), + ARM64_HAS_IRQ_PRIO_MASKING) + : "=&r" (res) + : "r" ((int) flags) + : "memory"); + + return res; +} + static inline unsigned long arch_local_irq_save(void) { unsigned long flags; flags = arch_local_save_flags(); - arch_local_irq_disable(); + /* + * There are too many states with IRQs disabled, just keep the current + * state if interrupts are already disabled/masked. + */ + if (!arch_irqs_disabled_flags(flags)) + arch_local_irq_disable(); return flags; } @@ -108,26 +123,10 @@ static inline void arch_local_irq_restore(unsigned long flags) __msr_s(SYS_ICC_PMR_EL1, "%0") "dsb sy", ARM64_HAS_IRQ_PRIO_MASKING) - : "+r" (flags) : + : "r" (flags) : "memory"); } -static inline int arch_irqs_disabled_flags(unsigned long flags) -{ - int res; - - asm volatile(ALTERNATIVE( - "and %w0, %w1, #" __stringify(PSR_I_BIT) "\n" - "nop", - "cmp %w1, #" __stringify(GIC_PRIO_IRQOFF) "\n" - "cset %w0, ls", - ARM64_HAS_IRQ_PRIO_MASKING) - : "=&r" (res) - : "r" ((int) flags) - : "memory"); - - return res; -} #endif #endif diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2ca437ef59fa..44a243754c1b 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -30,6 +30,12 @@ {ARM_EXCEPTION_TRAP, "TRAP" }, \ {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" } +/* + * Size of the HYP vectors preamble. kvm_patch_vector_branch() generates code + * that jumps over this. 
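+ * (That is, the first 2 * AARCH64_INSN_SIZE = 8 bytes of each vector slot,
+ * since every AArch64 instruction is 4 bytes.)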
+ */ +#define KVM_VECTOR_PREAMBLE (2 * AARCH64_INSN_SIZE) + #ifndef __ASSEMBLY__ #include <linux/mm.h> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 034dadec7168..d69c1efc63e7 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -126,7 +126,7 @@ static inline unsigned long *__vcpu_elr_el1(const struct kvm_vcpu *vcpu) static inline unsigned long vcpu_read_elr_el1(const struct kvm_vcpu *vcpu) { if (vcpu->arch.sysregs_loaded_on_cpu) - return read_sysreg_el1(elr); + return read_sysreg_el1(SYS_ELR); else return *__vcpu_elr_el1(vcpu); } @@ -134,7 +134,7 @@ static inline unsigned long vcpu_read_elr_el1(const struct kvm_vcpu *vcpu) static inline void vcpu_write_elr_el1(const struct kvm_vcpu *vcpu, unsigned long v) { if (vcpu->arch.sysregs_loaded_on_cpu) - write_sysreg_el1(v, elr); + write_sysreg_el1(v, SYS_ELR); else *__vcpu_elr_el1(vcpu) = v; } @@ -186,7 +186,7 @@ static inline unsigned long vcpu_read_spsr(const struct kvm_vcpu *vcpu) return vcpu_read_spsr32(vcpu); if (vcpu->arch.sysregs_loaded_on_cpu) - return read_sysreg_el1(spsr); + return read_sysreg_el1(SYS_SPSR); else return vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1]; } @@ -199,7 +199,7 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) } if (vcpu->arch.sysregs_loaded_on_cpu) - write_sysreg_el1(v, spsr); + write_sysreg_el1(v, SYS_SPSR); else vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v; } @@ -353,6 +353,20 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; } +static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG; +} + +static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu, + bool flag) +{ + if (flag) + vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG; + else + vcpu->arch.workaround_flags &= ~VCPU_WORKAROUND_2_FLAG; +} + static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { @@ -451,13 +465,13 @@ static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) */ static inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu) { - *vcpu_pc(vcpu) = read_sysreg_el2(elr); - vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(SYS_SPSR); kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); - write_sysreg_el2(*vcpu_pc(vcpu), elr); + write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, SYS_SPSR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); } #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c328191aa202..f656169db8c3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -19,12 +19,12 @@ #include <asm/arch_gicv3.h> #include <asm/barrier.h> #include <asm/cpufeature.h> +#include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> -#include <asm/smp_plat.h> #include <asm/thread_info.h> #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -484,11 +484,10 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data); -static inline void kvm_init_host_cpu_context(struct 
kvm_cpu_context *cpu_ctxt, - int cpu) +static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { /* The host's MPIDR is immutable, so let's set it up at boot time */ - cpu_ctxt->sys_regs[MPIDR_EL1] = cpu_logical_map(cpu); + cpu_ctxt->sys_regs[MPIDR_EL1] = read_cpuid_mpidr(); } void __kvm_enable_ssbs(void); @@ -597,11 +596,12 @@ static inline void kvm_arm_vhe_guest_enter(void) * will not signal the CPU of interrupts of lower priority, and the * only way to get out will be via guest exceptions. * Naturally, we want to avoid this. + * + * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a + * dsb to ensure the redistributor forwards EL2 IRQs to the CPU. */ - if (system_uses_irq_prio_masking()) { - gic_write_pmr(GIC_PRIO_IRQON); + if (system_uses_irq_prio_masking()) dsb(sy); - } } static inline void kvm_arm_vhe_guest_exit(void) @@ -620,9 +620,21 @@ static inline void kvm_arm_vhe_guest_exit(void) isb(); } -static inline bool kvm_arm_harden_branch_predictor(void) +#define KVM_BP_HARDEN_UNKNOWN -1 +#define KVM_BP_HARDEN_WA_NEEDED 0 +#define KVM_BP_HARDEN_NOT_REQUIRED 1 + +static inline int kvm_arm_harden_branch_predictor(void) { - return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR); + switch (get_spectre_v2_workaround_state()) { + case ARM64_BP_HARDEN_WA_NEEDED: + return KVM_BP_HARDEN_WA_NEEDED; + case ARM64_BP_HARDEN_NOT_REQUIRED: + return KVM_BP_HARDEN_NOT_REQUIRED; + case ARM64_BP_HARDEN_UNKNOWN: + default: + return KVM_BP_HARDEN_UNKNOWN; + } } #define KVM_SSBD_UNKNOWN -1 diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 286f7e7e1be4..86825aa20852 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -18,7 +18,7 @@ #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ - asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##nvh),\ + asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \ __mrs_s("%0", r##vh), \ ARM64_HAS_VIRT_HOST_EXTN) \ : "=r" (reg)); \ @@ -28,7 +28,7 @@ #define write_sysreg_elx(v,r,nvh,vh) \ do { \ u64 __val = (u64)(v); \ - asm volatile(ALTERNATIVE("msr " __stringify(r##nvh) ", %x0",\ + asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"), \ __msr_s(r##vh, "%x0"), \ ARM64_HAS_VIRT_HOST_EXTN) \ : : "rZ" (__val)); \ @@ -37,55 +37,15 @@ /* * Unified accessors for registers that have a different encoding * between VHE and non-VHE. They must be specified without their "ELx" - * encoding. + * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
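+ * For example, read_sysreg_el1(SYS_ELR) reads ELR_EL1 on non-VHE and
+ * ELR_EL12 when VHE is in use, via the ALTERNATIVE()-patched accessors
+ * above.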
*/ -#define read_sysreg_el2(r) \ - ({ \ - u64 reg; \ - asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##_EL2),\ - "mrs %0, " __stringify(r##_EL1),\ - ARM64_HAS_VIRT_HOST_EXTN) \ - : "=r" (reg)); \ - reg; \ - }) - -#define write_sysreg_el2(v,r) \ - do { \ - u64 __val = (u64)(v); \ - asm volatile(ALTERNATIVE("msr " __stringify(r##_EL2) ", %x0",\ - "msr " __stringify(r##_EL1) ", %x0",\ - ARM64_HAS_VIRT_HOST_EXTN) \ - : : "rZ" (__val)); \ - } while (0) #define read_sysreg_el0(r) read_sysreg_elx(r, _EL0, _EL02) #define write_sysreg_el0(v,r) write_sysreg_elx(v, r, _EL0, _EL02) #define read_sysreg_el1(r) read_sysreg_elx(r, _EL1, _EL12) #define write_sysreg_el1(v,r) write_sysreg_elx(v, r, _EL1, _EL12) - -/* The VHE specific system registers and their encoding */ -#define sctlr_EL12 sys_reg(3, 5, 1, 0, 0) -#define cpacr_EL12 sys_reg(3, 5, 1, 0, 2) -#define ttbr0_EL12 sys_reg(3, 5, 2, 0, 0) -#define ttbr1_EL12 sys_reg(3, 5, 2, 0, 1) -#define tcr_EL12 sys_reg(3, 5, 2, 0, 2) -#define afsr0_EL12 sys_reg(3, 5, 5, 1, 0) -#define afsr1_EL12 sys_reg(3, 5, 5, 1, 1) -#define esr_EL12 sys_reg(3, 5, 5, 2, 0) -#define far_EL12 sys_reg(3, 5, 6, 0, 0) -#define mair_EL12 sys_reg(3, 5, 10, 2, 0) -#define amair_EL12 sys_reg(3, 5, 10, 3, 0) -#define vbar_EL12 sys_reg(3, 5, 12, 0, 0) -#define contextidr_EL12 sys_reg(3, 5, 13, 0, 1) -#define cntkctl_EL12 sys_reg(3, 5, 14, 1, 0) -#define cntp_tval_EL02 sys_reg(3, 5, 14, 2, 0) -#define cntp_ctl_EL02 sys_reg(3, 5, 14, 2, 1) -#define cntp_cval_EL02 sys_reg(3, 5, 14, 2, 2) -#define cntv_tval_EL02 sys_reg(3, 5, 14, 3, 0) -#define cntv_ctl_EL02 sys_reg(3, 5, 14, 3, 1) -#define cntv_cval_EL02 sys_reg(3, 5, 14, 3, 2) -#define spsr_EL12 sys_reg(3, 5, 4, 0, 0) -#define elr_EL12 sys_reg(3, 5, 4, 0, 1) +#define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1) +#define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1) /** * hyp_alternate_select - Generates patchable code sequences that are diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index cdced518378d..14d0bc44d451 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -13,18 +13,23 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + #define check_pgt_cache() do { } while (0) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { + gfp_t gfp = GFP_PGTABLE_USER; struct page *page; - page = alloc_page(PGALLOC_GFP); + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + + page = alloc_page(gfp); if (!page) return NULL; if (!pgtable_pmd_page_ctor(page)) { @@ -61,7 +66,7 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)__get_free_page(PGALLOC_GFP); + return (pud_t *)__get_free_page(GFP_PGTABLE_USER); } static inline void pud_free(struct mm_struct *mm, pud_t *pudp) @@ -89,42 +94,6 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(PGALLOC_GFP); -} - -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(PGALLOC_GFP, 0); - if (!pte) 
- return NULL; - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; -} - -/* - * Free a PTE table. - */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *ptep) -{ - if (ptep) - free_page((unsigned long)ptep); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t ptep, pmdval_t prot) { diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 30e5e67749e5..db92950bb1a0 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -115,7 +115,6 @@ * Level 2 descriptor (PMD). */ #define PMD_TYPE_MASK (_AT(pmdval_t, 3) << 0) -#define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) #define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) @@ -142,8 +141,8 @@ /* * Level 3 descriptor (PTE). */ +#define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_TYPE_MASK (_AT(pteval_t, 3) << 0) -#define PTE_TYPE_FAULT (_AT(pteval_t, 0) << 0) #define PTE_TYPE_PAGE (_AT(pteval_t, 3) << 0) #define PTE_TABLE_BIT (_AT(pteval_t, 1) << 1) #define PTE_USER (_AT(pteval_t, 1) << 6) /* AP[1] */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index c81583be034b..f318258a14be 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -13,7 +13,6 @@ /* * Software defined PTE bits definition. */ -#define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fca26759081a..3052381baaeb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -235,29 +235,42 @@ extern void __sync_icache_dcache(pte_t pteval); * * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) */ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + +static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, + pte_t pte) { pte_t old_pte; - if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) - __sync_icache_dcache(pte); + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + old_pte = READ_ONCE(*ptep); + + if (!pte_valid(old_pte) || !pte_valid(pte)) + return; + if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1) + return; /* - * If the existing pte is valid, check for potential race with - * hardware updates of the pte (ptep_set_access_flags safely changes - * valid ptes without going through an invalid entry). + * Check for potential race with hardware updates of the pte + * (ptep_set_access_flags safely changes valid ptes without going + * through an invalid entry). 
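+ * For example, clearing the access flag or dirty state in place while
+ * hardware (e.g. DBM) may still update the live pte is the kind of racy
+ * transition the VM_WARN_ONCE() checks below are meant to flag.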
*/ - old_pte = READ_ONCE(*ptep); - if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(old_pte) && pte_valid(pte) && - (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) { - VM_WARN_ONCE(!pte_young(pte), - "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(old_pte), pte_val(pte)); - VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), - "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(old_pte), pte_val(pte)); - } + VM_WARN_ONCE(!pte_young(pte), + "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); + VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), + "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", + __func__, pte_val(old_pte), pte_val(pte)); +} + +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte); + + __check_racy_pte_update(mm, ptep, pte); set_pte(ptep, pte); } @@ -324,9 +337,14 @@ static inline pmd_t pte_pmd(pte_t pte) return __pmd(pte_val(pte)); } -static inline pgprot_t mk_sect_prot(pgprot_t prot) +static inline pgprot_t mk_pud_sect_prot(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT); +} + +static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) { - return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT); + return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); } #ifdef CONFIG_NUMA_BALANCING diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index dad858b6adc6..81693244f58d 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -24,9 +24,15 @@ * means masking more IRQs (or at least that the same IRQs remain masked). * * To mask interrupts, we clear the most significant bit of PMR. + * + * Some code sections either automatically switch back to PSR.I or explicitly + * require that priority masking not be used. If bit GIC_PRIO_PSR_I_SET is + * included in the priority mask, it indicates that PSR.I should be set and + * that interrupt disabling temporarily does not rely on IRQ priorities.
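+ *
+ * For example, the entry code's gic_prio_kentry_setup macro uses
+ * (GIC_PRIO_PSR_I_SET | GIC_PRIO_IRQON) as the PMR value while PSR.I is
+ * the active masking mechanism.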
*/ -#define GIC_PRIO_IRQON 0xf0 -#define GIC_PRIO_IRQOFF (GIC_PRIO_IRQON & ~0x80) +#define GIC_PRIO_IRQON 0xc0 +#define GIC_PRIO_IRQOFF (GIC_PRIO_IRQON & ~0x80) +#define GIC_PRIO_PSR_I_SET (1 << 4) /* Additional SPSR bits not exposed in the UABI */ #define PSR_IL_BIT (1 << 20) diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index 0418c67f2b8b..bd43d1cf724b 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -9,6 +9,52 @@ #ifdef CONFIG_COMPAT #include <linux/compat.h> +struct compat_sigcontext { + /* We always set these two fields to 0 */ + compat_ulong_t trap_no; + compat_ulong_t error_code; + + compat_ulong_t oldmask; + compat_ulong_t arm_r0; + compat_ulong_t arm_r1; + compat_ulong_t arm_r2; + compat_ulong_t arm_r3; + compat_ulong_t arm_r4; + compat_ulong_t arm_r5; + compat_ulong_t arm_r6; + compat_ulong_t arm_r7; + compat_ulong_t arm_r8; + compat_ulong_t arm_r9; + compat_ulong_t arm_r10; + compat_ulong_t arm_fp; + compat_ulong_t arm_ip; + compat_ulong_t arm_sp; + compat_ulong_t arm_lr; + compat_ulong_t arm_pc; + compat_ulong_t arm_cpsr; + compat_ulong_t fault_address; +}; + +struct compat_ucontext { + compat_ulong_t uc_flags; + compat_uptr_t uc_link; + compat_stack_t uc_stack; + struct compat_sigcontext uc_mcontext; + compat_sigset_t uc_sigmask; + int __unused[32 - (sizeof(compat_sigset_t) / sizeof(int))]; + compat_ulong_t uc_regspace[128] __attribute__((__aligned__(8))); +}; + +struct compat_sigframe { + struct compat_ucontext uc; + compat_ulong_t retcode[2]; +}; + +struct compat_rt_sigframe { + struct compat_siginfo info; + struct compat_sigframe sig; +}; + int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 7e245b9e03a5..7434844036d3 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -12,9 +12,9 @@ #include <linux/preempt.h> #include <linux/types.h> -#ifdef CONFIG_KERNEL_MODE_NEON +DECLARE_PER_CPU(bool, fpsimd_context_busy); -DECLARE_PER_CPU(bool, kernel_neon_busy); +#ifdef CONFIG_KERNEL_MODE_NEON /* * may_use_simd - whether it is allowable at this time to issue SIMD @@ -26,15 +26,15 @@ DECLARE_PER_CPU(bool, kernel_neon_busy); static __must_check inline bool may_use_simd(void) { /* - * kernel_neon_busy is only set while preemption is disabled, + * fpsimd_context_busy is only set while preemption is disabled, * and is clear whenever preemption is enabled. Since - * this_cpu_read() is atomic w.r.t. preemption, kernel_neon_busy + * this_cpu_read() is atomic w.r.t. preemption, fpsimd_context_busy * cannot change under our feet -- if it's set we cannot be * migrated, and if it's clear we cannot be migrated to a CPU * where it is set. */ return !in_irq() && !irqs_disabled() && !in_nmi() && - !this_cpu_read(kernel_neon_busy); + !this_cpu_read(fpsimd_context_busy); } #else /* ! 
CONFIG_KERNEL_MODE_NEON */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index cd7f7ce1a56a..a7522fca1105 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -191,6 +191,9 @@ #define SYS_APGAKEYLO_EL1 sys_reg(3, 0, 2, 3, 0) #define SYS_APGAKEYHI_EL1 sys_reg(3, 0, 2, 3, 1) +#define SYS_SPSR_EL1 sys_reg(3, 0, 4, 0, 0) +#define SYS_ELR_EL1 sys_reg(3, 0, 4, 0, 1) + #define SYS_ICC_PMR_EL1 sys_reg(3, 0, 4, 6, 0) #define SYS_AFSR0_EL1 sys_reg(3, 0, 5, 1, 0) @@ -382,6 +385,9 @@ #define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) #define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) +#define SYS_CNTV_CTL_EL0 sys_reg(3, 3, 14, 3, 1) +#define SYS_CNTV_CVAL_EL0 sys_reg(3, 3, 14, 3, 2) + #define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) #define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) #define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) @@ -392,14 +398,17 @@ #define __TYPER_CRm(n) (0xc | (((n) >> 3) & 0x3)) #define SYS_PMEVTYPERn_EL0(n) sys_reg(3, 3, 14, __TYPER_CRm(n), __PMEV_op2(n)) -#define SYS_PMCCFILTR_EL0 sys_reg (3, 3, 14, 15, 7) +#define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7) #define SYS_ZCR_EL2 sys_reg(3, 4, 1, 2, 0) - #define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0) +#define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) +#define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1) #define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1) +#define SYS_ESR_EL2 sys_reg(3, 4, 5, 2, 0) #define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3) #define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0) +#define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0) #define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) #define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x) @@ -444,7 +453,29 @@ #define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7) /* VHE encodings for architectural EL0/1 system registers */ +#define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0) +#define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2) #define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0) +#define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0) +#define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1) +#define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2) +#define SYS_SPSR_EL12 sys_reg(3, 5, 4, 0, 0) +#define SYS_ELR_EL12 sys_reg(3, 5, 4, 0, 1) +#define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0) +#define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1) +#define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0) +#define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0) +#define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0) +#define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0) +#define SYS_VBAR_EL12 sys_reg(3, 5, 12, 0, 0) +#define SYS_CONTEXTIDR_EL12 sys_reg(3, 5, 13, 0, 1) +#define SYS_CNTKCTL_EL12 sys_reg(3, 5, 14, 1, 0) +#define SYS_CNTP_TVAL_EL02 sys_reg(3, 5, 14, 2, 0) +#define SYS_CNTP_CTL_EL02 sys_reg(3, 5, 14, 2, 1) +#define SYS_CNTP_CVAL_EL02 sys_reg(3, 5, 14, 2, 2) +#define SYS_CNTV_TVAL_EL02 sys_reg(3, 5, 14, 3, 0) +#define SYS_CNTV_CTL_EL02 sys_reg(3, 5, 14, 3, 1) +#define SYS_CNTV_CVAL_EL02 sys_reg(3, 5, 14, 3, 2) /* Common SCTLR_ELx flags. 
*/ #define SCTLR_ELx_DSSBS (_BITUL(44)) @@ -549,6 +580,7 @@ /* id_aa64isar1 */ #define ID_AA64ISAR1_SB_SHIFT 36 +#define ID_AA64ISAR1_FRINTTS_SHIFT 32 #define ID_AA64ISAR1_GPI_SHIFT 28 #define ID_AA64ISAR1_GPA_SHIFT 24 #define ID_AA64ISAR1_LRCPC_SHIFT 20 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 2372e97db29c..180b34ec5965 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -65,6 +65,7 @@ void arch_release_task_struct(struct task_struct *tsk); * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace * TIF_SYSCALL_AUDIT - syscall auditing * TIF_SECCOMP - syscall secure computing + * TIF_SYSCALL_EMU - syscall emulation active * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user @@ -80,6 +81,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_SYSCALL_AUDIT 9 #define TIF_SYSCALL_TRACEPOINT 10 #define TIF_SECCOMP 11 +#define TIF_SYSCALL_EMU 12 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 #define TIF_RESTORE_SIGMASK 20 @@ -98,6 +100,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) @@ -109,7 +112,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ - _TIF_NOHZ) + _TIF_NOHZ | _TIF_SYSCALL_EMU) #define INIT_THREAD_INFO(tsk) \ { \ diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index c9f8dd421c5f..2629a68b8724 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -22,8 +22,13 @@ #define __NR_compat_exit 1 #define __NR_compat_read 3 #define __NR_compat_write 4 +#define __NR_compat_gettimeofday 78 #define __NR_compat_sigreturn 119 #define __NR_compat_rt_sigreturn 173 +#define __NR_compat_clock_getres 247 +#define __NR_compat_clock_gettime 263 +#define __NR_compat_clock_gettime64 403 +#define __NR_compat_clock_getres_time64 406 /* * The following SVCs are ARM private. 
 */
@@ -33,10 +38,11 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 434 +#define __NR_compat_syscalls 436 #endif #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 #ifndef __COMPAT_SYSCALL_NR #include <uapi/asm/unistd.h> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index aa995920bd34..94ab29cf4f00 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -875,6 +875,10 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig) __SYSCALL(__NR_fsmount, sys_fsmount) #define __NR_fspick 433 __SYSCALL(__NR_fspick, sys_fspick) +#define __NR_pidfd_open 434 +__SYSCALL(__NR_pidfd_open, sys_pidfd_open) +#define __NR_clone3 435 +__SYSCALL(__NR_clone3, sys_clone3) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h index 1f94ec19903c..9c15e0a06301 100644 --- a/arch/arm64/include/asm/vdso.h +++ b/arch/arm64/include/asm/vdso.h @@ -17,6 +17,9 @@ #ifndef __ASSEMBLY__ #include <generated/vdso-offsets.h> +#ifdef CONFIG_COMPAT_VDSO +#include <generated/vdso32-offsets.h> +#endif #define VDSO_SYMBOL(base, name) \ ({ \ diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h new file mode 100644 index 000000000000..fb60a88b5ed4 --- /dev/null +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 ARM Limited + */ +#ifndef __COMPAT_BARRIER_H +#define __COMPAT_BARRIER_H + +#ifndef __ASSEMBLY__ +/* + * Warning: This code is meant to be used with + * ENABLE_COMPAT_VDSO only. + */ +#ifndef ENABLE_COMPAT_VDSO +#error This header is meant to be used with ENABLE_COMPAT_VDSO only +#endif + +#ifdef dmb +#undef dmb +#endif + +#define dmb(option) __asm__ __volatile__ ("dmb " #option : : : "memory") + +#if __LINUX_ARM_ARCH__ >= 8 +#define aarch32_smp_mb() dmb(ish) +#define aarch32_smp_rmb() dmb(ishld) +#define aarch32_smp_wmb() dmb(ishst) +#else +#define aarch32_smp_mb() dmb(ish) +#define aarch32_smp_rmb() aarch32_smp_mb() +#define aarch32_smp_wmb() dmb(ishst) +#endif + + +#undef smp_mb +#undef smp_rmb +#undef smp_wmb + +#define smp_mb() aarch32_smp_mb() +#define smp_rmb() aarch32_smp_rmb() +#define smp_wmb() aarch32_smp_wmb() + +#endif /* !__ASSEMBLY__ */ + +#endif /* __COMPAT_BARRIER_H */ diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h new file mode 100644 index 000000000000..f4812777f5c5 --- /dev/null +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 ARM Limited + */ +#ifndef __ASM_VDSO_GETTIMEOFDAY_H +#define __ASM_VDSO_GETTIMEOFDAY_H + +#ifndef __ASSEMBLY__ + +#include <asm/unistd.h> +#include <uapi/linux/time.h> + +#include <asm/vdso/compat_barrier.h> + +#define __VDSO_USE_SYSCALL ULLONG_MAX + +#define VDSO_HAS_CLOCK_GETRES 1 + +static __always_inline +int gettimeofday_fallback(struct __kernel_old_timeval *_tv, + struct timezone *_tz) +{ + register struct timezone *tz asm("r1") = _tz; + register struct __kernel_old_timeval *tv asm("r0") = _tv; + register long ret asm ("r0"); + register long nr asm("r7") = __NR_compat_gettimeofday; + + asm volatile( + " swi #0\n" + : "=r" (ret) + : "r" (tv), "r" (tz), "r" (nr) + : "memory"); + + return ret; +} + +static __always_inline +long 
clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + register struct __kernel_timespec *ts asm("r1") = _ts; + register clockid_t clkid asm("r0") = _clkid; + register long ret asm ("r0"); + register long nr asm("r7") = __NR_compat_clock_gettime64; + + asm volatile( + " swi #0\n" + : "=r" (ret) + : "r" (clkid), "r" (ts), "r" (nr) + : "memory"); + + return ret; +} + +static __always_inline +int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + register struct __kernel_timespec *ts asm("r1") = _ts; + register clockid_t clkid asm("r0") = _clkid; + register long ret asm ("r0"); + register long nr asm("r7") = __NR_compat_clock_getres_time64; + + /* The checks below are required for ABI consistency with arm */ + if ((_clkid >= MAX_CLOCKS) && (_ts == NULL)) + return -EINVAL; + + asm volatile( + " swi #0\n" + : "=r" (ret) + : "r" (clkid), "r" (ts), "r" (nr) + : "memory"); + + return ret; +} + +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +{ + u64 res; + + /* + * clock_mode == 0 implies that the vDSO is enabled; otherwise, + * fall back on the syscall. + */ + if (clock_mode) + return __VDSO_USE_SYSCALL; + + /* + * This isb() is required to prevent the counter value from + * being speculated. + */ + isb(); + asm volatile("mrrc p15, 1, %Q0, %R0, c14" : "=r" (res)); + /* + * This isb() is required to prevent the seq lock from being + * speculated. + */ + isb(); + + return res; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + const struct vdso_data *ret; + + /* + * This simply puts &_vdso_data into ret. The reason why we don't use + * `ret = _vdso_data` is that the compiler tends to optimise this in a + * very suboptimal way: instead of keeping &_vdso_data in a register, + * it goes through a relocation almost every time _vdso_data must be + * accessed (even in subfunctions). This is both time and space + * consuming: each relocation uses a word in the code section, and it + * has to be loaded at runtime. + * + * This trick hides the assignment from the compiler. Since it cannot + * track where the pointer comes from, it will only use one relocation + * where __arch_get_vdso_data() is called, and then keep the result in + * a register.
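+ *
+ * (For illustration: a direct `ret = _vdso_data;` is what the asm below
+ * replaces; the compiler cannot see through the "mov", so it computes
+ * the address once and reuses it.)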
+ */ + asm volatile("mov %0, %1" : "=r"(ret) : "r"(_vdso_data)); + + return ret; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h new file mode 100644 index 000000000000..b08f476b72b4 --- /dev/null +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 ARM Limited + */ +#ifndef __ASM_VDSO_GETTIMEOFDAY_H +#define __ASM_VDSO_GETTIMEOFDAY_H + +#ifndef __ASSEMBLY__ + +#include <asm/unistd.h> +#include <uapi/linux/time.h> + +#define __VDSO_USE_SYSCALL ULLONG_MAX + +#define VDSO_HAS_CLOCK_GETRES 1 + +static __always_inline +int gettimeofday_fallback(struct __kernel_old_timeval *_tv, + struct timezone *_tz) +{ + register struct timezone *tz asm("x1") = _tz; + register struct __kernel_old_timeval *tv asm("x0") = _tv; + register long ret asm ("x0"); + register long nr asm("x8") = __NR_gettimeofday; + + asm volatile( + " svc #0\n" + : "=r" (ret) + : "r" (tv), "r" (tz), "r" (nr) + : "memory"); + + return ret; +} + +static __always_inline +long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + register struct __kernel_timespec *ts asm("x1") = _ts; + register clockid_t clkid asm("x0") = _clkid; + register long ret asm ("x0"); + register long nr asm("x8") = __NR_clock_gettime; + + asm volatile( + " svc #0\n" + : "=r" (ret) + : "r" (clkid), "r" (ts), "r" (nr) + : "memory"); + + return ret; +} + +static __always_inline +int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + register struct __kernel_timespec *ts asm("x1") = _ts; + register clockid_t clkid asm("x0") = _clkid; + register long ret asm ("x0"); + register long nr asm("x8") = __NR_clock_getres; + + asm volatile( + " svc #0\n" + : "=r" (ret) + : "r" (clkid), "r" (ts), "r" (nr) + : "memory"); + + return ret; +} + +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +{ + u64 res; + + /* + * clock_mode == 0 implies that the vDSO is enabled; otherwise, + * fall back on the syscall. + */ + if (clock_mode) + return __VDSO_USE_SYSCALL; + + /* + * This isb() is required to prevent the counter value from + * being speculated. + */ + isb(); + asm volatile("mrs %0, cntvct_el0" : "=r" (res) :: "memory"); + /* + * This isb() is required to prevent the seq lock from being + * speculated. + */ + isb(); + + return res; +} + +static __always_inline +const struct vdso_data *__arch_get_vdso_data(void) +{ + return _vdso_data; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h new file mode 100644 index 000000000000..0c731bfc7c8c --- /dev/null +++ b/arch/arm64/include/asm/vdso/vsyscall.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> + +#define VDSO_PRECISION_MASK ~(0xFF00ULL<<48) + +extern struct vdso_data *vdso_data; + +/* + * Update the vDSO data page to keep in sync with kernel timekeeping.
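+ * These __arch_* hooks are picked up by the generic update_vsyscall()
+ * code via the asm-generic/vdso/vsyscall.h include at the bottom of
+ * this file.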
+ */ +static __always_inline +struct vdso_data *__arm64_get_k_vdso_data(void) +{ + return vdso_data; +} +#define __arch_get_k_vdso_data __arm64_get_k_vdso_data + +static __always_inline +int __arm64_get_clock_mode(struct timekeeper *tk) +{ + u32 use_syscall = !tk->tkr_mono.clock->archdata.vdso_direct; + + return use_syscall; +} +#define __arch_get_clock_mode __arm64_get_clock_mode + +static __always_inline +int __arm64_use_vsyscall(struct vdso_data *vdata) +{ + return !vdata[CS_HRES_COARSE].clock_mode; +} +#define __arch_use_vsyscall __arm64_use_vsyscall + +static __always_inline +void __arm64_update_vsyscall(struct vdso_data *vdata, struct timekeeper *tk) +{ + vdata[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK; + vdata[CS_RAW].mask = VDSO_PRECISION_MASK; +} +#define __arch_update_vsyscall __arm64_update_vsyscall + +/* The asm-generic header needs to be included after the definitions above */ +#include <asm-generic/vdso/vsyscall.h> + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 1a772b162191..a1e72886b30c 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -63,5 +63,7 @@ #define HWCAP2_SVEBITPERM (1 << 4) #define HWCAP2_SVESHA3 (1 << 5) #define HWCAP2_SVESM4 (1 << 6) +#define HWCAP2_FLAGM2 (1 << 7) +#define HWCAP2_FRINT (1 << 8) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index d819a3e8b552..9a507716ae2f 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -229,6 +229,16 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_FW_REG(r) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ KVM_REG_ARM_FW | ((r) & 0xffff)) #define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 KVM_REG_ARM_FW_REG(1) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL 2 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) /* SVE registers */ #define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index e932284993d4..7ed9294e2004 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -62,6 +62,9 @@ #define PSR_x 0x0000ff00 /* Extension */ #define PSR_c 0x000000ff /* Control */ +/* syscall emulation path in ptrace */ +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 3d448a0bb225..8b0ebce92427 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -146,7 +146,7 @@ struct sve_context { * vector length beyond its initial architectural limit of 2048 bits * (16 quadwords). * - * See linux/Documentation/arm64/sve.txt for a description of the VL/VQ + * See linux/Documentation/arm64/sve.rst for a description of the VL/VQ * terminology. 
*/ #define SVE_VQ_BYTES __SVE_VQ_BYTES /* bytes per quadword */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 9e7dcb2c31c7..478491f07b4f 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -28,7 +28,10 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE $(call if_changed,objcopy) obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ - sigreturn32.o sys_compat.o + sys_compat.o +ifneq ($(CONFIG_COMPAT_VDSO), y) +obj-$(CONFIG_COMPAT) += sigreturn32.o +endif obj-$(CONFIG_KUSER_HELPERS) += kuser32.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o obj-$(CONFIG_MODULES) += module.o @@ -62,6 +65,7 @@ obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-y += vdso/ probes/ +obj-$(CONFIG_COMPAT_VDSO) += vdso32/ head-y := head.o extra-y += $(head-y) vmlinux.lds diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 2804330c95dc..3a58e9db5cfe 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -152,10 +152,14 @@ static int __init acpi_fadt_sanity_check(void) */ if (table->revision < 5 || (table->revision == 5 && fadt->minor_revision < 1)) { - pr_err("Unsupported FADT revision %d.%d, should be 5.1+\n", + pr_err(FW_BUG "Unsupported FADT revision %d.%d, should be 5.1+\n", table->revision, fadt->minor_revision); - ret = -EINVAL; - goto out; + + if (!fadt->arm_boot_flags) { + ret = -EINVAL; + goto out; + } + pr_err("FADT has ARM boot flags set, assuming 5.1\n"); } if (!(fadt->flags & ACPI_FADT_HW_REDUCED)) { diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 02f08768c298..214685760e1c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -18,9 +18,9 @@ #include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> +#include <asm/signal32.h> #include <asm/smp_plat.h> #include <asm/suspend.h> -#include <asm/vdso_datapage.h> #include <linux/kbuild.h> #include <linux/arm-smccc.h> @@ -66,6 +66,11 @@ int main(void) DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); +#ifdef CONFIG_COMPAT + DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); + DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); + BLANK(); +#endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); @@ -80,33 +85,6 @@ int main(void) BLANK(); DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); BLANK(); - DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); - DEFINE(CLOCK_REALTIME_RES, offsetof(struct vdso_data, hrtimer_res)); - DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); - DEFINE(CLOCK_COARSE_RES, LOW_RES_NSEC); - DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); - BLANK(); - DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last)); - DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec)); - DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec)); - DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec)); - DEFINE(VDSO_XTIME_CRS_NSEC, offsetof(struct vdso_data, xtime_coarse_nsec)); - DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec)); - DEFINE(VDSO_TB_SEQ_COUNT, offsetof(struct vdso_data, tb_seq_count)); - 
DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult)); - DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift)); - DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest)); - DEFINE(VDSO_USE_SYSCALL, offsetof(struct vdso_data, use_syscall)); - BLANK(); - DEFINE(TVAL_TV_SEC, offsetof(struct timeval, tv_sec)); - DEFINE(TSPEC_TV_SEC, offsetof(struct timespec, tv_sec)); - BLANK(); - DEFINE(TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); - DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); - BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 880d79904d36..7fa6828bb488 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -17,6 +17,15 @@ #define CLIDR_CTYPE(clidr, level) \ (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) +int cache_line_size(void) +{ + if (coherency_max_size != 0) + return coherency_max_size; + + return cache_line_size_of_cpu(); +} +EXPORT_SYMBOL_GPL(cache_line_size); + static inline enum cache_type get_cache_type(int level) { u64 clidr; diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index ca11ff7bf55e..1e43ba5c79b7 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -554,6 +554,17 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) static bool __hardenbp_enab = true; static bool __spectrev2_safe = true; +int get_spectre_v2_workaround_state(void) +{ + if (__spectrev2_safe) + return ARM64_BP_HARDEN_NOT_REQUIRED; + + if (!__hardenbp_enab) + return ARM64_BP_HARDEN_UNKNOWN; + + return ARM64_BP_HARDEN_WA_NEEDED; +} + /* * List of CPUs that do not need any Spectre-v2 mitigation at all. 
*/ @@ -854,13 +865,15 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { - if (__spectrev2_safe) + switch (get_spectre_v2_workaround_state()) { + case ARM64_BP_HARDEN_NOT_REQUIRED: return sprintf(buf, "Not affected\n"); - - if (__hardenbp_enab) + case ARM64_BP_HARDEN_WA_NEEDED: return sprintf(buf, "Mitigation: Branch predictor hardening\n"); - - return sprintf(buf, "Vulnerable\n"); + case ARM64_BP_HARDEN_UNKNOWN: + default: + return sprintf(buf, "Vulnerable\n"); + } } ssize_t cpu_show_spec_store_bypass(struct device *dev, diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index aabdabf52fdb..f29f36a65175 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1184,14 +1184,14 @@ static struct undef_hook ssbs_emulation_hook = { static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused) { static bool undef_hook_registered = false; - static DEFINE_SPINLOCK(hook_lock); + static DEFINE_RAW_SPINLOCK(hook_lock); - spin_lock(&hook_lock); + raw_spin_lock(&hook_lock); if (!undef_hook_registered) { register_undef_hook(&ssbs_emulation_hook); undef_hook_registered = true; } - spin_unlock(&hook_lock); + raw_spin_unlock(&hook_lock); if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); @@ -1618,6 +1618,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -1629,6 +1630,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 0593665fc7b4..876055e37352 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -82,6 +82,8 @@ static const char *const hwcap_str[] = { "svebitperm", "svesha3", "svesm4", + "flagm2", + "frint", NULL }; diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 3c33d0dd8e0e..d0cf596db82c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -82,8 +82,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t 
*md) return 0; } -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; pte_t pte = READ_ONCE(*ptep); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2df8d0a1d980..9cdc4592da3e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -247,6 +247,7 @@ alternative_else_nop_endif /* * Registers that may be useful after this macro is invoked: * + * x20 - ICC_PMR_EL1 * x21 - aborted SP * x22 - aborted PC * x23 - aborted PSTATE @@ -424,6 +425,38 @@ tsk .req x28 // current thread_info irq_stack_exit .endm +#ifdef CONFIG_ARM64_PSEUDO_NMI + /* + * Set res to 0 if irqs were unmasked in interrupted context. + * Otherwise set res to non-0 value. + */ + .macro test_irqs_unmasked res:req, pmr:req +alternative_if ARM64_HAS_IRQ_PRIO_MASKING + sub \res, \pmr, #GIC_PRIO_IRQON +alternative_else + mov \res, xzr +alternative_endif + .endm +#endif + + .macro gic_prio_kentry_setup, tmp:req +#ifdef CONFIG_ARM64_PSEUDO_NMI + alternative_if ARM64_HAS_IRQ_PRIO_MASKING + mov \tmp, #(GIC_PRIO_PSR_I_SET | GIC_PRIO_IRQON) + msr_s SYS_ICC_PMR_EL1, \tmp + alternative_else_nop_endif +#endif + .endm + + .macro gic_prio_irq_setup, pmr:req, tmp:req +#ifdef CONFIG_ARM64_PSEUDO_NMI + alternative_if ARM64_HAS_IRQ_PRIO_MASKING + orr \tmp, \pmr, #GIC_PRIO_PSR_I_SET + msr_s SYS_ICC_PMR_EL1, \tmp + alternative_else_nop_endif +#endif + .endm + .text /* @@ -602,6 +635,7 @@ el1_dbg: cmp x24, #ESR_ELx_EC_BRK64 // if BRK64 cinc x24, x24, eq // set bit '0' tbz x24, #0, el1_inv // EL1 only + gic_prio_kentry_setup tmp=x3 mrs x0, far_el1 mov x2, sp // struct pt_regs bl do_debug_exception @@ -619,20 +653,18 @@ ENDPROC(el1_sync) .align 6 el1_irq: kernel_entry 1 + gic_prio_irq_setup pmr=x20, tmp=x1 enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS + #ifdef CONFIG_ARM64_PSEUDO_NMI -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - ldr x20, [sp, #S_PMR_SAVE] -alternative_else - mov x20, #GIC_PRIO_IRQON -alternative_endif - cmp x20, #GIC_PRIO_IRQOFF - /* Irqs were disabled, don't trace */ - b.ls 1f + test_irqs_unmasked res=x0, pmr=x20 + cbz x0, 1f + bl asm_nmi_enter +1: #endif + +#ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off -1: #endif irq_handler @@ -651,14 +683,23 @@ alternative_else_nop_endif bl preempt_schedule_irq // irq en/disable is done inside 1: #endif -#ifdef CONFIG_TRACE_IRQFLAGS + #ifdef CONFIG_ARM64_PSEUDO_NMI /* - * if IRQs were disabled when we received the interrupt, we have an NMI - * and we are not re-enabling interrupt upon eret. Skip tracing. + * When using IRQ priority masking, we can get spurious interrupts while + * PMR is set to GIC_PRIO_IRQOFF. An NMI might also have occurred in a + * section with interrupts disabled. Skip tracing in those cases. 
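+ * (test_irqs_unmasked leaves 0 in \res only when PMR was GIC_PRIO_IRQON,
+ * i.e. the exception was taken from a context with IRQs unmasked.)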
*/ - cmp x20, #GIC_PRIO_IRQOFF - b.ls 1f + test_irqs_unmasked res=x0, pmr=x20 + cbz x0, 1f + bl asm_nmi_exit +1: +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_ARM64_PSEUDO_NMI + test_irqs_unmasked res=x0, pmr=x20 + cbnz x0, 1f #endif bl trace_hardirqs_on 1: @@ -776,6 +817,7 @@ el0_ia: * Instruction abort handling */ mrs x26, far_el1 + gic_prio_kentry_setup tmp=x0 enable_da_f #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off @@ -821,6 +863,7 @@ el0_sp_pc: * Stack or PC alignment exception handling */ mrs x26, far_el1 + gic_prio_kentry_setup tmp=x0 enable_da_f #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off @@ -855,11 +898,12 @@ el0_dbg: * Debug exception handling */ tbnz x24, #0, el0_inv // EL0 only + gic_prio_kentry_setup tmp=x3 mrs x0, far_el1 mov x1, x25 mov x2, sp bl do_debug_exception - enable_daif + enable_da_f ct_user_exit b ret_to_user el0_inv: @@ -876,7 +920,9 @@ ENDPROC(el0_sync) el0_irq: kernel_entry 0 el0_irq_naked: + gic_prio_irq_setup pmr=x20, tmp=x0 enable_da_f + #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif @@ -898,6 +944,7 @@ ENDPROC(el0_irq) el1_error: kernel_entry 1 mrs x1, esr_el1 + gic_prio_kentry_setup tmp=x2 enable_dbg mov x0, sp bl do_serror @@ -908,10 +955,11 @@ el0_error: kernel_entry 0 el0_error_naked: mrs x1, esr_el1 + gic_prio_kentry_setup tmp=x2 enable_dbg mov x0, sp bl do_serror - enable_daif + enable_da_f ct_user_exit b ret_to_user ENDPROC(el0_error) @@ -932,6 +980,7 @@ work_pending: */ ret_to_user: disable_daif + gic_prio_kentry_setup tmp=x3 ldr x1, [tsk, #TSK_TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending @@ -948,6 +997,7 @@ ENDPROC(ret_to_user) */ .align 6 el0_svc: + gic_prio_kentry_setup tmp=x1 mov x0, sp bl el0_svc_handler b ret_to_user diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 0cfcf5c237c5..eec4776ae5f0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -82,7 +82,8 @@ * To prevent this from racing with the manipulation of the task's FPSIMD state * from task context and thereby corrupting the state, it is necessary to * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with local_bh_disable() unless softirqs are already masked. + * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to + * run but prevent them from using FPSIMD. * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_cpu field @@ -145,6 +146,56 @@ extern void __percpu *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ +DEFINE_PER_CPU(bool, fpsimd_context_busy); +EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); + +static void __get_cpu_fpsimd_context(void) +{ + bool busy = __this_cpu_xchg(fpsimd_context_busy, true); + + WARN_ON(busy); +} + +/* + * Claim ownership of the CPU FPSIMD context for use by the calling context. + * + * The caller may freely manipulate the FPSIMD context metadata until + * put_cpu_fpsimd_context() is called. + * + * The double-underscore version must only be called if you know the task + * can't be preempted. + */ +static void get_cpu_fpsimd_context(void) +{ + preempt_disable(); + __get_cpu_fpsimd_context(); +} + +static void __put_cpu_fpsimd_context(void) +{ + bool busy = __this_cpu_xchg(fpsimd_context_busy, false); + + WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */ +} + +/* + * Release the CPU FPSIMD context.
+ * + * Must be called from a context in which get_cpu_fpsimd_context() was + * previously called, with no call to put_cpu_fpsimd_context() in the + * meantime. + */ +static void put_cpu_fpsimd_context(void) +{ + __put_cpu_fpsimd_context(); + preempt_enable(); +} + +static bool have_cpu_fpsimd_context(void) +{ + return !preemptible() && __this_cpu_read(fpsimd_context_busy); +} + /* * Call __sve_free() directly only if you know task can't be scheduled * or preempted. @@ -215,12 +266,10 @@ static void sve_free(struct task_struct *task) * This function should be called only when the FPSIMD/SVE state in * thread_struct is known to be up to date, when preparing to enter * userspace. - * - * Softirqs (and preemption) must be disabled. */ static void task_fpsimd_load(void) { - WARN_ON(!in_softirq() && !irqs_disabled()); + WARN_ON(!have_cpu_fpsimd_context()); if (system_supports_sve() && test_thread_flag(TIF_SVE)) sve_load_state(sve_pffr(¤t->thread), @@ -233,16 +282,14 @@ static void task_fpsimd_load(void) /* * Ensure FPSIMD/SVE storage in memory for the loaded context is up to * date with respect to the CPU registers. - * - * Softirqs (and preemption) must be disabled. */ -void fpsimd_save(void) +static void fpsimd_save(void) { struct fpsimd_last_state_struct const *last = this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ - WARN_ON(!in_softirq() && !irqs_disabled()); + WARN_ON(!have_cpu_fpsimd_context()); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { if (system_supports_sve() && test_thread_flag(TIF_SVE)) { @@ -364,7 +411,8 @@ static __uint128_t arm64_cpu_to_le128(__uint128_t x) * task->thread.sve_state. * * Task can be a non-runnable task, or current. In the latter case, - * softirqs (and preemption) must be disabled. + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. * task->thread.sve_state must point to at least sve_state_size(task) * bytes of allocated kernel memory. * task->thread.uw.fpsimd_state must be up to date before calling this @@ -393,7 +441,8 @@ static void fpsimd_to_sve(struct task_struct *task) * task->thread.uw.fpsimd_state. * * Task can be a non-runnable task, or current. In the latter case, - * softirqs (and preemption) must be disabled. + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. * task->thread.sve_state must point to at least sve_state_size(task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. @@ -557,7 +606,7 @@ int sve_set_vector_length(struct task_struct *task, * non-SVE thread. 
*/ if (task == current) { - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_save(); } @@ -567,7 +616,7 @@ int sve_set_vector_length(struct task_struct *task, sve_to_fpsimd(task); if (task == current) - local_bh_enable(); + put_cpu_fpsimd_context(); /* * Force reallocation of task SVE state to the correct size @@ -880,7 +929,7 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) sve_alloc(current); - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_save(); @@ -891,7 +940,7 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -935,6 +984,8 @@ void fpsimd_thread_switch(struct task_struct *next) if (!system_supports_fpsimd()) return; + __get_cpu_fpsimd_context(); + /* Save unsaved fpsimd state, if any: */ fpsimd_save(); @@ -949,6 +1000,8 @@ void fpsimd_thread_switch(struct task_struct *next) update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, wrong_task || wrong_cpu); + + __put_cpu_fpsimd_context(); } void fpsimd_flush_thread(void) @@ -958,7 +1011,7 @@ void fpsimd_flush_thread(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_flush_task_state(current); memset(¤t->thread.uw.fpsimd_state, 0, @@ -999,7 +1052,7 @@ void fpsimd_flush_thread(void) current->thread.sve_vl_onexec = 0; } - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1011,9 +1064,9 @@ void fpsimd_preserve_current_state(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); fpsimd_save(); - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1030,7 +1083,8 @@ void fpsimd_signal_preserve_current_state(void) /* * Associate current's FPSIMD context with this cpu - * Preemption must be disabled when calling this function. + * The caller must have ownership of the cpu FPSIMD context before calling + * this function. */ void fpsimd_bind_task_to_cpu(void) { @@ -1076,14 +1130,14 @@ void fpsimd_restore_current_state(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { task_fpsimd_load(); fpsimd_bind_task_to_cpu(); } - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1096,7 +1150,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); current->thread.uw.fpsimd_state = *state; if (system_supports_sve() && test_thread_flag(TIF_SVE)) @@ -1107,7 +1161,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) clear_thread_flag(TIF_FOREIGN_FPSTATE); - local_bh_enable(); + put_cpu_fpsimd_context(); } /* @@ -1133,18 +1187,29 @@ void fpsimd_flush_task_state(struct task_struct *t) /* * Invalidate any task's FPSIMD state that is present on this cpu. - * This function must be called with softirqs disabled. + * The FPSIMD context should be acquired with get_cpu_fpsimd_context() + * before calling this function. */ -void fpsimd_flush_cpu_state(void) +static void fpsimd_flush_cpu_state(void) { __this_cpu_write(fpsimd_last_state.st, NULL); set_thread_flag(TIF_FOREIGN_FPSTATE); } -#ifdef CONFIG_KERNEL_MODE_NEON +/* + * Save the FPSIMD state to memory and invalidate cpu view. + * This function must be called with preemption disabled. 
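+ * (The CPU PM notifier at the end of this file, for example, uses it
+ * when entering a low-power state.)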
+ */ +void fpsimd_save_and_flush_cpu_state(void) +{ + WARN_ON(preemptible()); + __get_cpu_fpsimd_context(); + fpsimd_save(); + fpsimd_flush_cpu_state(); + __put_cpu_fpsimd_context(); +} -DEFINE_PER_CPU(bool, kernel_neon_busy); -EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); +#ifdef CONFIG_KERNEL_MODE_NEON /* * Kernel-side NEON support functions @@ -1170,19 +1235,13 @@ void kernel_neon_begin(void) BUG_ON(!may_use_simd()); - local_bh_disable(); - - __this_cpu_write(kernel_neon_busy, true); + get_cpu_fpsimd_context(); /* Save unsaved fpsimd state, if any: */ fpsimd_save(); /* Invalidate any task state remaining in the fpsimd regs: */ fpsimd_flush_cpu_state(); - - preempt_disable(); - - local_bh_enable(); } EXPORT_SYMBOL(kernel_neon_begin); @@ -1197,15 +1256,10 @@ EXPORT_SYMBOL(kernel_neon_begin); */ void kernel_neon_end(void) { - bool busy; - if (!system_supports_fpsimd()) return; - busy = __this_cpu_xchg(kernel_neon_busy, false); - WARN_ON(!busy); /* No matching kernel_neon_begin()? */ - - preempt_enable(); + put_cpu_fpsimd_context(); } EXPORT_SYMBOL(kernel_neon_end); @@ -1297,8 +1351,7 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, { switch (cmd) { case CPU_PM_ENTER: - fpsimd_save(); - fpsimd_flush_cpu_state(); + fpsimd_save_and_flush_cpu_state(); break; case CPU_PM_EXIT: break; diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 04ca08086d35..2b85c0d6fa3d 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -67,7 +67,11 @@ #ifdef CONFIG_EFI -__efistub_stext_offset = stext - _text; +/* + * Use ABSOLUTE() to avoid ld.lld treating this as a relative symbol: + * https://github.com/ClangBuiltLinux/linux/issues/561 + */ +__efistub_stext_offset = ABSOLUTE(stext - _text); /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index c70034fbd4ce..04a327ccf84d 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -16,8 +16,10 @@ #include <linux/smp.h> #include <linux/init.h> #include <linux/irqchip.h> +#include <linux/kprobes.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <asm/daifflags.h> #include <asm/vmap_stack.h> unsigned long irq_err_count; @@ -64,4 +66,28 @@ void __init init_IRQ(void) irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); + + if (system_uses_irq_prio_masking()) { + /* + * Now that we have a stack for our IRQ handler, set + * the PMR/PSR pair to a consistent state. + */ + WARN_ON(read_sysreg(daif) & PSR_A_BIT); + local_daif_restore(DAIF_PROCCTX_NOIRQ); + } +} + +/* + * Stubs to make nmi_enter/exit() code callable from ASM + */ +asmlinkage void notrace asm_nmi_enter(void) +{ + nmi_enter(); +} +NOKPROBE_SYMBOL(asm_nmi_enter); + +asmlinkage void notrace asm_nmi_exit(void) +{ + nmi_exit(); } +NOKPROBE_SYMBOL(asm_nmi_exit); diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index 07bf740bea91..2514fd6f12cb 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -53,7 +53,7 @@ static void *image_load(struct kimage *image, /* * We require a kernel with an unambiguous Image header. Per - * Documentation/booting.txt, this is the case when image_size + * Documentation/arm64/booting.rst, this is the case when image_size * is non-zero (practically speaking, since v3.17). 
 */
*/ h = (struct arm64_image_header *)kernel; diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e23a68a5808f..46e643e30708 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -21,6 +21,7 @@ void *module_alloc(unsigned long size) { + u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; gfp_t gfp_mask = GFP_KERNEL; void *p; @@ -28,9 +29,12 @@ void *module_alloc(unsigned long size) if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) gfp_mask |= __GFP_NOWARN; + if (IS_ENABLED(CONFIG_KASAN)) + /* don't exceed the static module region - see below */ + module_alloc_end = MODULES_END; + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_base + MODULES_VSIZE, - gfp_mask, PAGE_KERNEL_EXEC, 0, + module_alloc_end, gfp_mask, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && @@ -46,7 +50,7 @@ void *module_alloc(unsigned long size) */ p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, module_alloc_base + SZ_2G, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 88ce502c8e6f..bd5dfffca272 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -122,8 +122,10 @@ void *alloc_insn_page(void) void *page; page = vmalloc_exec(PAGE_SIZE); - if (page) + if (page) { set_memory_ro((unsigned long)page, 1); + set_vm_flush_reset_perms(page); + } return page; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 9856395ccdb7..6a869d9f304f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -83,7 +83,7 @@ static void __cpu_do_idle_irqprio(void) * be raised. */ pmr = gic_read_pmr(); - gic_write_pmr(GIC_PRIO_IRQON); + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); __cpu_do_idle(); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index da2441d7b066..3cf3b135027e 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1808,8 +1808,12 @@ static void tracehook_report_syscall(struct pt_regs *regs, int syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE)) + if (test_thread_flag(TIF_SYSCALL_TRACE) || + test_thread_flag(TIF_SYSCALL_EMU)) { tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + if (!in_syscall(regs) || test_thread_flag(TIF_SYSCALL_EMU)) + return -1; + } /* Do the secure computing after ptrace; failures should be fast. */ if (secure_computing(NULL) == -1) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7e541f947b4c..9c4bad7d7131 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -283,6 +283,11 @@ void __init setup_arch(char **cmdline_p) setup_machine_fdt(__fdt_pointer); + /* + * Initialise the static keys early as they may be enabled by the + * cpufeature code and early parameters. 
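+ * (For instance, an early_param handler that flips a static key would
+ * otherwise run before jump_label_init() and hit an uninitialised key.)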
+ */ + jump_label_init(); parse_early_param(); /* diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 331d1e5acad4..12a585386c2f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -18,42 +18,7 @@ #include <asm/traps.h> #include <linux/uaccess.h> #include <asm/unistd.h> - -struct compat_sigcontext { - /* We always set these two fields to 0 */ - compat_ulong_t trap_no; - compat_ulong_t error_code; - - compat_ulong_t oldmask; - compat_ulong_t arm_r0; - compat_ulong_t arm_r1; - compat_ulong_t arm_r2; - compat_ulong_t arm_r3; - compat_ulong_t arm_r4; - compat_ulong_t arm_r5; - compat_ulong_t arm_r6; - compat_ulong_t arm_r7; - compat_ulong_t arm_r8; - compat_ulong_t arm_r9; - compat_ulong_t arm_r10; - compat_ulong_t arm_fp; - compat_ulong_t arm_ip; - compat_ulong_t arm_sp; - compat_ulong_t arm_lr; - compat_ulong_t arm_pc; - compat_ulong_t arm_cpsr; - compat_ulong_t fault_address; -}; - -struct compat_ucontext { - compat_ulong_t uc_flags; - compat_uptr_t uc_link; - compat_stack_t uc_stack; - struct compat_sigcontext uc_mcontext; - compat_sigset_t uc_sigmask; - int __unused[32 - (sizeof (compat_sigset_t) / sizeof (int))]; - compat_ulong_t uc_regspace[128] __attribute__((__aligned__(8))); -}; +#include <asm/vdso.h> struct compat_vfp_sigframe { compat_ulong_t magic; @@ -81,16 +46,6 @@ struct compat_aux_sigframe { unsigned long end_magic; } __attribute__((__aligned__(8))); -struct compat_sigframe { - struct compat_ucontext uc; - compat_ulong_t retcode[2]; -}; - -struct compat_rt_sigframe { - struct compat_siginfo info; - struct compat_sigframe sig; -}; - #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) @@ -387,6 +342,30 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, retcode = ptr_to_compat(ka->sa.sa_restorer); } else { /* Set up sigreturn pointer */ +#ifdef CONFIG_COMPAT_VDSO + void *vdso_base = current->mm->context.vdso; + void *vdso_trampoline; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (thumb) { + vdso_trampoline = VDSO_SYMBOL(vdso_base, + compat_rt_sigreturn_thumb); + } else { + vdso_trampoline = VDSO_SYMBOL(vdso_base, + compat_rt_sigreturn_arm); + } + } else { + if (thumb) { + vdso_trampoline = VDSO_SYMBOL(vdso_base, + compat_sigreturn_thumb); + } else { + vdso_trampoline = VDSO_SYMBOL(vdso_base, + compat_sigreturn_arm); + } + } + + retcode = ptr_to_compat(vdso_trampoline) + thumb; +#else unsigned int idx = thumb << 1; if (ka->sa.sa_flags & SA_SIGINFO) @@ -394,6 +373,7 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, retcode = (unsigned long)current->mm->context.vdso + (idx << 2) + thumb; +#endif } regs->regs[0] = usig; diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 3e53ffa07994..f5b04dd8a710 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -27,7 +27,7 @@ * aff0 = mpidr_masked & 0xff; * aff1 = mpidr_masked & 0xff00; * aff2 = mpidr_masked & 0xff0000; - * aff2 = mpidr_masked & 0xff00000000; + * aff3 = mpidr_masked & 0xff00000000; * dst = (aff0 >> rs0 | aff1 >> rs1 | aff2 >> rs2 | aff3 >> rs3); *} * Input registers: rs0, rs1, rs2, rs3, mpidr, mask diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6dcf9607d770..ea90d3bd9253 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -181,11 +181,7 @@ static void init_gic_priority_masking(void) WARN_ON(!(cpuflags & PSR_I_BIT)); - gic_write_pmr(GIC_PRIO_IRQOFF); - - 
/* We can only unmask PSR.I if we can take aborts */ - if (!(cpuflags & PSR_A_BIT)) - write_sysreg(cpuflags & ~PSR_I_BIT, daif); + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } /* @@ -424,11 +420,6 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* - * Initialise the static keys early as they may be enabled by the - * cpufeature code. - */ - jump_label_init(); cpuinfo_store_boot_cpu(); /* @@ -834,18 +825,23 @@ void arch_irq_work_raise(void) } #endif -/* - * ipi_cpu_stop - handle IPI from smp_send_stop() - */ -static void ipi_cpu_stop(unsigned int cpu) +static void local_cpu_stop(void) { - set_cpu_online(cpu, false); + set_cpu_online(smp_processor_id(), false); local_daif_mask(); sdei_mask_local_cpu(); + cpu_park_loop(); +} - while (1) - cpu_relax(); +/* + * We need to implement panic_smp_self_stop() for parallel panic() calls, so + * that cpu_online_mask gets correctly updated and smp_send_stop() can skip + * CPUs that have already stopped themselves. + */ +void panic_smp_self_stop(void) +{ + local_cpu_stop(); } #ifdef CONFIG_KEXEC_CORE @@ -898,7 +894,7 @@ void handle_IPI(int ipinr, struct pt_regs *regs) case IPI_CPU_STOP: irq_enter(); - ipi_cpu_stop(cpu); + local_cpu_stop(); irq_exit(); break; diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 985721a1264c..8c03456dade6 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -55,16 +55,19 @@ static void dump_backtrace_entry(unsigned long where) printk(" %pS\n", (void *)where); } -static void __dump_instr(const char *lvl, struct pt_regs *regs) +static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; + if (user_mode(regs)) + return; + for (i = -4; i < 1; i++) { unsigned int val, bad; - bad = get_user(val, &((u32 *)addr)[i]); + bad = aarch64_insn_read(&((u32 *)addr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); @@ -73,19 +76,8 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs) break; } } - printk("%sCode: %s\n", lvl, str); -} -static void dump_instr(const char *lvl, struct pt_regs *regs) -{ - if (!user_mode(regs)) { - mm_segment_t fs = get_fs(); - set_fs(KERNEL_DS); - __dump_instr(lvl, regs); - set_fs(fs); - } else { - __dump_instr(lvl, regs); - } + printk("%sCode: %s\n", lvl, str); } void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) @@ -171,8 +163,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) print_modules(); show_regs(regs); - if (!user_mode(regs)) - dump_instr(KERN_EMERG, regs); + dump_kernel_instr(KERN_EMERG, regs); return ret; } @@ -242,16 +233,16 @@ void arm64_force_sig_fault(int signo, int code, void __user *addr, { arm64_show_signal(signo, str); if (signo == SIGKILL) - force_sig(SIGKILL, current); + force_sig(SIGKILL); else - force_sig_fault(signo, code, addr, current); + force_sig_fault(signo, code, addr); } void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); - force_sig_mceerr(code, addr, lsb, current); + force_sig_mceerr(code, addr, lsb); } void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, @@ -880,6 +871,10 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) /* * The CPU can't make progress. The exception may have * been imprecise. 
+ * + * Neoverse-N1 #1349291 means a non-KVM SError reported as + * Unrecoverable should be treated as Uncontainable. We + * call arm64_serror_panic() in both cases. */ return true; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 663b166241d0..354b11e27c07 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -20,41 +20,212 @@ #include <linux/slab.h> #include <linux/timekeeper_internal.h> #include <linux/vmalloc.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> +#include <vdso/vsyscall.h> #include <asm/cacheflush.h> #include <asm/signal32.h> #include <asm/vdso.h> -#include <asm/vdso_datapage.h> extern char vdso_start[], vdso_end[]; -static unsigned long vdso_pages __ro_after_init; +#ifdef CONFIG_COMPAT_VDSO +extern char vdso32_start[], vdso32_end[]; +#endif /* CONFIG_COMPAT_VDSO */ + +/* vdso_lookup arch_index */ +enum arch_vdso_type { + ARM64_VDSO = 0, +#ifdef CONFIG_COMPAT_VDSO + ARM64_VDSO32 = 1, +#endif /* CONFIG_COMPAT_VDSO */ +}; +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_TYPES (ARM64_VDSO32 + 1) +#else +#define VDSO_TYPES (ARM64_VDSO + 1) +#endif /* CONFIG_COMPAT_VDSO */ + +struct __vdso_abi { + const char *name; + const char *vdso_code_start; + const char *vdso_code_end; + unsigned long vdso_pages; + /* Data Mapping */ + struct vm_special_mapping *dm; + /* Code Mapping */ + struct vm_special_mapping *cm; +}; + +static struct __vdso_abi vdso_lookup[VDSO_TYPES] __ro_after_init = { + { + .name = "vdso", + .vdso_code_start = vdso_start, + .vdso_code_end = vdso_end, + }, +#ifdef CONFIG_COMPAT_VDSO + { + .name = "vdso32", + .vdso_code_start = vdso32_start, + .vdso_code_end = vdso32_end, + }, +#endif /* CONFIG_COMPAT_VDSO */ +}; /* * The vDSO data page. */ static union { - struct vdso_data data; + struct vdso_data data[CS_BASES]; u8 page[PAGE_SIZE]; } vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +struct vdso_data *vdso_data = vdso_data_store.data; + +static int __vdso_remap(enum arch_vdso_type arch_index, + const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + unsigned long vdso_size = vdso_lookup[arch_index].vdso_code_end - + vdso_lookup[arch_index].vdso_code_start; + + if (vdso_size != new_size) + return -EINVAL; + + current->mm->context.vdso = (void *)new_vma->vm_start; + + return 0; +} + +static int __vdso_init(enum arch_vdso_type arch_index) +{ + int i; + struct page **vdso_pagelist; + unsigned long pfn; + + if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) { + pr_err("vDSO is not a valid ELF object!\n"); + return -EINVAL; + } + + vdso_lookup[arch_index].vdso_pages = ( + vdso_lookup[arch_index].vdso_code_end - + vdso_lookup[arch_index].vdso_code_start) >> + PAGE_SHIFT; + + /* Allocate the vDSO pagelist, plus a page for the data. */ + vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1, + sizeof(struct page *), + GFP_KERNEL); + if (vdso_pagelist == NULL) + return -ENOMEM; + + /* Grab the vDSO data page. */ + vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); + + + /* Grab the vDSO code pages. 
*/ + pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start); + + for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++) + vdso_pagelist[i + 1] = pfn_to_page(pfn + i); + + vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0]; + vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1]; + + return 0; +} + +static int __setup_additional_pages(enum arch_vdso_type arch_index, + struct mm_struct *mm, + struct linux_binprm *bprm, + int uses_interp) +{ + unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + void *ret; + + vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT; + /* Be sure to map the data page */ + vdso_mapping_len = vdso_text_len + PAGE_SIZE; + + vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + if (IS_ERR_VALUE(vdso_base)) { + ret = ERR_PTR(vdso_base); + goto up_fail; + } + + ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, + VM_READ|VM_MAYREAD, + vdso_lookup[arch_index].dm); + if (IS_ERR(ret)) + goto up_fail; + + vdso_base += PAGE_SIZE; + mm->context.vdso = (void *)vdso_base; + ret = _install_special_mapping(mm, vdso_base, vdso_text_len, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso_lookup[arch_index].cm); + if (IS_ERR(ret)) + goto up_fail; + + return 0; + +up_fail: + mm->context.vdso = NULL; + return PTR_ERR(ret); +} #ifdef CONFIG_COMPAT /* * Create and map the vectors page for AArch32 tasks. */ +#ifdef CONFIG_COMPAT_VDSO +static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + return __vdso_remap(ARM64_VDSO32, sm, new_vma); +} +#endif /* CONFIG_COMPAT_VDSO */ + +/* + * aarch32_vdso_pages: + * 0 - kuser helpers + * 1 - sigreturn code + * or (CONFIG_COMPAT_VDSO): + * 0 - kuser helpers + * 1 - vdso data + * 2 - vdso code + */ #define C_VECTORS 0 +#ifdef CONFIG_COMPAT_VDSO +#define C_VVAR 1 +#define C_VDSO 2 +#define C_PAGES (C_VDSO + 1) +#else #define C_SIGPAGE 1 #define C_PAGES (C_SIGPAGE + 1) +#endif /* CONFIG_COMPAT_VDSO */ static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init; -static const struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { +static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { { .name = "[vectors]", /* ABI */ .pages = &aarch32_vdso_pages[C_VECTORS], }, +#ifdef CONFIG_COMPAT_VDSO + { + .name = "[vvar]", + }, + { + .name = "[vdso]", + .mremap = aarch32_vdso_mremap, + }, +#else { .name = "[sigpage]", /* ABI */ .pages = &aarch32_vdso_pages[C_SIGPAGE], }, +#endif /* CONFIG_COMPAT_VDSO */ }; static int aarch32_alloc_kuser_vdso_page(void) @@ -77,7 +248,33 @@ static int aarch32_alloc_kuser_vdso_page(void) return 0; } -static int __init aarch32_alloc_vdso_pages(void) +#ifdef CONFIG_COMPAT_VDSO +static int __aarch32_alloc_vdso_pages(void) +{ + int ret; + + vdso_lookup[ARM64_VDSO32].dm = &aarch32_vdso_spec[C_VVAR]; + vdso_lookup[ARM64_VDSO32].cm = &aarch32_vdso_spec[C_VDSO]; + + ret = __vdso_init(ARM64_VDSO32); + if (ret) + return ret; + + ret = aarch32_alloc_kuser_vdso_page(); + if (ret) { + unsigned long c_vvar = + (unsigned long)page_to_virt(aarch32_vdso_pages[C_VVAR]); + unsigned long c_vdso = + (unsigned long)page_to_virt(aarch32_vdso_pages[C_VDSO]); + + free_page(c_vvar); + free_page(c_vdso); + } + + return ret; +} +#else +static int __aarch32_alloc_vdso_pages(void) { extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; @@ -98,6 +295,12 @@ static int __init aarch32_alloc_vdso_pages(void) return ret; } +#endif /* CONFIG_COMPAT_VDSO */ + 
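
The net effect of __setup_additional_pages() is a pair of adjacent VMAs: the data page mapped read-only directly below the code pages, so the vDSO can find its data at a fixed negative offset. One way to observe the resulting layout from userspace (ordinary portable C, nothing arm64-specific):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "[vdso]") || strstr(line, "[vvar]"))
			fputs(line, stdout);	/* [vvar] sits just below [vdso] */
	fclose(f);
	return 0;
}
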
+static int __init aarch32_alloc_vdso_pages(void) +{ + return __aarch32_alloc_vdso_pages(); +} arch_initcall(aarch32_alloc_vdso_pages); static int aarch32_kuser_helpers_setup(struct mm_struct *mm) @@ -119,6 +322,7 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) return PTR_ERR_OR_ZERO(ret); } +#ifndef CONFIG_COMPAT_VDSO static int aarch32_sigreturn_setup(struct mm_struct *mm) { unsigned long addr; @@ -146,6 +350,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) out: return PTR_ERR_OR_ZERO(ret); } +#endif /* !CONFIG_COMPAT_VDSO */ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { @@ -159,7 +364,14 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (ret) goto out; +#ifdef CONFIG_COMPAT_VDSO + ret = __setup_additional_pages(ARM64_VDSO32, + mm, + bprm, + uses_interp); +#else ret = aarch32_sigreturn_setup(mm); +#endif /* CONFIG_COMPAT_VDSO */ out: up_write(&mm->mmap_sem); @@ -170,18 +382,18 @@ out: static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_end - vdso_start; - - if (vdso_size != new_size) - return -EINVAL; - - current->mm->context.vdso = (void *)new_vma->vm_start; - - return 0; + return __vdso_remap(ARM64_VDSO, sm, new_vma); } -static struct vm_special_mapping vdso_spec[2] __ro_after_init = { +/* + * aarch64_vdso_pages: + * 0 - vvar + * 1 - vdso + */ +#define A_VVAR 0 +#define A_VDSO 1 +#define A_PAGES (A_VDSO + 1) +static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { { .name = "[vvar]", }, @@ -193,37 +405,10 @@ static struct vm_special_mapping vdso_spec[2] __ro_after_init = { static int __init vdso_init(void) { - int i; - struct page **vdso_pagelist; - unsigned long pfn; - - if (memcmp(vdso_start, "\177ELF", 4)) { - pr_err("vDSO is not a valid ELF object!\n"); - return -EINVAL; - } - - vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; - - /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), - GFP_KERNEL); - if (vdso_pagelist == NULL) - return -ENOMEM; - - /* Grab the vDSO data page. */ - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); - - - /* Grab the vDSO code pages. 
*/ - pfn = sym_to_pfn(vdso_start); - - for (i = 0; i < vdso_pages; i++) - vdso_pagelist[i + 1] = pfn_to_page(pfn + i); + vdso_lookup[ARM64_VDSO].dm = &vdso_spec[A_VVAR]; + vdso_lookup[ARM64_VDSO].cm = &vdso_spec[A_VDSO]; - vdso_spec[0].pages = &vdso_pagelist[0]; - vdso_spec[1].pages = &vdso_pagelist[1]; - - return 0; + return __vdso_init(ARM64_VDSO); } arch_initcall(vdso_init); @@ -231,84 +416,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; - unsigned long vdso_base, vdso_text_len, vdso_mapping_len; - void *ret; - - vdso_text_len = vdso_pages << PAGE_SHIFT; - /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + int ret; if (down_write_killable(&mm->mmap_sem)) return -EINTR; - vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); - if (IS_ERR_VALUE(vdso_base)) { - ret = ERR_PTR(vdso_base); - goto up_fail; - } - ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, - VM_READ|VM_MAYREAD, - &vdso_spec[0]); - if (IS_ERR(ret)) - goto up_fail; - - vdso_base += PAGE_SIZE; - mm->context.vdso = (void *)vdso_base; - ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_spec[1]); - if (IS_ERR(ret)) - goto up_fail; + ret = __setup_additional_pages(ARM64_VDSO, + mm, + bprm, + uses_interp); up_write(&mm->mmap_sem); - return 0; - -up_fail: - mm->context.vdso = NULL; - up_write(&mm->mmap_sem); - return PTR_ERR(ret); -} -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. - */ -void update_vsyscall(struct timekeeper *tk) -{ - u32 use_syscall = !tk->tkr_mono.clock->archdata.vdso_direct; - - ++vdso_data->tb_seq_count; - smp_wmb(); - - vdso_data->use_syscall = use_syscall; - vdso_data->xtime_coarse_sec = tk->xtime_sec; - vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift; - vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; - vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; - - /* Read without the seqlock held by clock_getres() */ - WRITE_ONCE(vdso_data->hrtimer_res, hrtimer_resolution); - - if (!use_syscall) { - /* tkr_mono.cycle_last == tkr_raw.cycle_last */ - vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; - vdso_data->raw_time_sec = tk->raw_sec; - vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; - vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->cs_mono_mult = tk->tkr_mono.mult; - vdso_data->cs_raw_mult = tk->tkr_raw.mult; - /* tkr_mono.shift == tkr_raw.shift */ - vdso_data->cs_shift = tk->tkr_mono.shift; - } - - smp_wmb(); - ++vdso_data->tb_seq_count; -} - -void update_vsyscall_tz(void) -{ - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; + return ret; } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index fa230ff09aa1..4ab863045188 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -6,7 +6,12 @@ # Heavily based on the vDSO Makefiles for other archs. # -obj-vdso := gettimeofday.o note.o sigreturn.o +# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before +# the inclusion of generic Makefile. 
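
The open-coded update_vsyscall() deleted above published vdso_data under a hand-rolled sequence count: bump to an odd value, write the fields, bump back to even, while the vDSO side rereads until it observes a stable even count. The generic lib/vdso code this series switches to follows the same protocol. A hedged sketch of the idea, relying on the kernel's barrier primitives (struct snap and the helpers are illustrative, not kernel API):

struct snap {
	unsigned int seq;
	u64 sec;
	u64 nsec;
};

static void snap_publish(struct snap *s, u64 sec, u64 nsec)
{
	s->seq++;		/* odd: update in progress */
	smp_wmb();
	s->sec = sec;
	s->nsec = nsec;
	smp_wmb();
	s->seq++;		/* even: consistent again */
}

static void snap_read(const struct snap *s, u64 *sec, u64 *nsec)
{
	unsigned int seq;

	do {
		seq = READ_ONCE(s->seq);
		smp_rmb();
		*sec = s->sec;
		*nsec = s->nsec;
		smp_rmb();
	} while ((seq & 1) || READ_ONCE(s->seq) != seq);
}
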
+ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64 +include $(srctree)/lib/vdso/Makefile + +obj-vdso := vgettimeofday.o note.o sigreturn.o # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg @@ -15,6 +20,31 @@ obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ --build-id -n -T +ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 +ccflags-y += -DDISABLE_BRANCH_PROFILING + +VDSO_LDFLAGS := -Bsymbolic + +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os +KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y +KCOV_INSTRUMENT := n + +ifeq ($(c-gettimeofday-y),) +CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny +else +CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -include $(c-gettimeofday-y) +endif + +# Clang versions less than 8 do not support -mcmodel=tiny +ifeq ($(CONFIG_CC_IS_CLANG), y) + ifeq ($(shell test $(CONFIG_CLANG_VERSION) -lt 80000; echo $$?),0) + CFLAGS_REMOVE_vgettimeofday.o += -mcmodel=tiny + endif +endif + # Disable gcov profiling for VDSO code GCOV_PROFILE := n @@ -28,6 +58,7 @@ $(obj)/vdso.o : $(obj)/vdso.so # Link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,ld) + $(call if_changed,vdso_check) # Strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -42,13 +73,9 @@ quiet_cmd_vdsosym = VDSOSYM $@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) -# Assembly rules for the .S files -$(obj-vdso): %.o: %.S FORCE - $(call if_changed_dep,vdsoas) - # Actual build commands -quiet_cmd_vdsoas = VDSOA $@ - cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdsocc = VDSOCC $@ + cmd_vdsocc = $(CC) $(a_flags) $(c_flags) -c -o $@ $< # Install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index 80f780f56e0d..e69de29bb2d1 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -1,323 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Userspace implementations of gettimeofday() and friends. - * - * Copyright (C) 2012 ARM Limited - * - * Author: Will Deacon <will.deacon@arm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> - -#define NSEC_PER_SEC_LO16 0xca00 -#define NSEC_PER_SEC_HI16 0x3b9a - -vdso_data .req x6 -seqcnt .req w7 -w_tmp .req w8 -x_tmp .req x8 - -/* - * Conventions for macro arguments: - * - An argument is write-only if its name starts with "res". - * - All other arguments are read-only, unless otherwise specified. - */ - - .macro seqcnt_acquire -9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT] - tbnz seqcnt, #0, 9999b - dmb ishld - .endm - - .macro seqcnt_check fail - dmb ishld - ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT] - cmp w_tmp, seqcnt - b.ne \fail - .endm - - .macro syscall_check fail - ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL] - cbnz w_tmp, \fail - .endm - - .macro get_nsec_per_sec res - mov \res, #NSEC_PER_SEC_LO16 - movk \res, #NSEC_PER_SEC_HI16, lsl #16 - .endm - - /* - * Returns the clock delta, in nanoseconds left-shifted by the clock - * shift. - */ - .macro get_clock_shifted_nsec res, cycle_last, mult - /* Read the virtual counter. */ - isb - mrs x_tmp, cntvct_el0 - /* Calculate cycle delta and convert to ns. 
*/ - sub \res, x_tmp, \cycle_last - /* We can only guarantee 56 bits of precision. */ - movn x_tmp, #0xff00, lsl #48 - and \res, x_tmp, \res - mul \res, \res, \mult - /* - * Fake address dependency from the value computed from the counter - * register to subsequent data page accesses so that the sequence - * locking also orders the read of the counter. - */ - and x_tmp, \res, xzr - add vdso_data, vdso_data, x_tmp - .endm - - /* - * Returns in res_{sec,nsec} the REALTIME timespec, based on the - * "wall time" (xtime) and the clock_mono delta. - */ - .macro get_ts_realtime res_sec, res_nsec, \ - clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec - add \res_nsec, \clock_nsec, \xtime_nsec - udiv x_tmp, \res_nsec, \nsec_to_sec - add \res_sec, \xtime_sec, x_tmp - msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec - .endm - - /* - * Returns in res_{sec,nsec} the timespec based on the clock_raw delta, - * used for CLOCK_MONOTONIC_RAW. - */ - .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec - udiv \res_sec, \clock_nsec, \nsec_to_sec - msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec - .endm - - /* sec and nsec are modified in place. */ - .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec - /* Add timespec. */ - add \sec, \sec, \ts_sec - add \nsec, \nsec, \ts_nsec - - /* Normalise the new timespec. */ - cmp \nsec, \nsec_to_sec - b.lt 9999f - sub \nsec, \nsec, \nsec_to_sec - add \sec, \sec, #1 -9999: - cmp \nsec, #0 - b.ge 9998f - add \nsec, \nsec, \nsec_to_sec - sub \sec, \sec, #1 -9998: - .endm - - .macro clock_gettime_return, shift=0 - .if \shift == 1 - lsr x11, x11, x12 - .endif - stp x10, x11, [x1, #TSPEC_TV_SEC] - mov x0, xzr - ret - .endm - - .macro jump_slot jumptable, index, label - .if (. - \jumptable) != 4 * (\index) - .error "Jump slot index mismatch" - .endif - b \label - .endm - - .text - -/* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */ -ENTRY(__kernel_gettimeofday) - .cfi_startproc - adr vdso_data, _vdso_data - /* If tv is NULL, skip to the timezone code. */ - cbz x0, 2f - - /* Compute the time of day. */ -1: seqcnt_acquire - syscall_check fail=4f - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - seqcnt_check fail=1b - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - - /* Convert ns to us. */ - mov x13, #1000 - lsl x13, x13, x12 - udiv x11, x11, x13 - stp x10, x11, [x0, #TVAL_TV_SEC] -2: - /* If tz is NULL, return 0. */ - cbz x1, 3f - ldp w4, w5, [vdso_data, #VDSO_TZ_MINWEST] - stp w4, w5, [x1, #TZ_MINWEST] -3: - mov x0, xzr - ret -4: - /* Syscall fallback. 
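
The C unified vDSO keeps this fallback, issuing the same svc from inline assembly with the syscall number in x8 per the arm64 convention. Roughly, as a sketch of the pattern used by the arch fallback helpers rather than a verbatim copy:

static __always_inline
long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
{
	register struct __kernel_timespec *ts asm("x1") = _ts;
	register clockid_t clkid asm("x0") = _clkid;
	register long ret asm("x0");
	register long nr asm("x8") = __NR_clock_gettime;

	asm volatile("svc #0\n"
		     : "=r" (ret)
		     : "r" (clkid), "r" (ts), "r" (nr)
		     : "memory");

	return ret;
}
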
*/ - mov x8, #__NR_gettimeofday - svc #0 - ret - .cfi_endproc -ENDPROC(__kernel_gettimeofday) - -#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE - -/* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ -ENTRY(__kernel_clock_gettime) - .cfi_startproc - cmp w0, #JUMPSLOT_MAX - b.hi syscall - adr vdso_data, _vdso_data - adr x_tmp, jumptable - add x_tmp, x_tmp, w0, uxtw #2 - br x_tmp - - ALIGN -jumptable: - jump_slot jumptable, CLOCK_REALTIME, realtime - jump_slot jumptable, CLOCK_MONOTONIC, monotonic - b syscall - b syscall - jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw - jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse - jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse - - .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1) - .error "Wrong jumptable size" - .endif - - ALIGN -realtime: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - - /* All computations are done with left-shifted nsecs. */ - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - seqcnt_check fail=realtime - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -monotonic: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] - - /* All computations are done with left-shifted nsecs. */ - lsl x4, x4, x12 - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - seqcnt_check fail=monotonic - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - - add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -monotonic_raw: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_raw_mult, w12 = cs_shift */ - ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] - ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] - - /* All computations are done with left-shifted nsecs. */ - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - seqcnt_check fail=monotonic_raw - get_ts_clock_raw res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, nsec_to_sec=x9 - - add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -realtime_coarse: - seqcnt_acquire - ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] - seqcnt_check fail=realtime_coarse - clock_gettime_return - - ALIGN -monotonic_coarse: - seqcnt_acquire - ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] - ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic_coarse - - /* Computations are done in (non-shifted) nsecs. */ - get_nsec_per_sec res=x9 - add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return - - ALIGN -syscall: /* Syscall fallback. 
*/ - mov x8, #__NR_clock_gettime - svc #0 - ret - .cfi_endproc -ENDPROC(__kernel_clock_gettime) - -/* int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); */ -ENTRY(__kernel_clock_getres) - .cfi_startproc - cmp w0, #CLOCK_REALTIME - ccmp w0, #CLOCK_MONOTONIC, #0x4, ne - ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne - b.ne 1f - - adr vdso_data, _vdso_data - ldr w2, [vdso_data, #CLOCK_REALTIME_RES] - b 2f -1: - cmp w0, #CLOCK_REALTIME_COARSE - ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne - b.ne 4f - ldr x2, 5f -2: - cbz x1, 3f - stp xzr, x2, [x1] - -3: /* res == NULL. */ - mov w0, wzr - ret - -4: /* Syscall fallback. */ - mov x8, #__NR_clock_getres - svc #0 - ret -5: - .quad CLOCK_COARSE_RES - .cfi_endproc -ENDPROC(__kernel_clock_getres) diff --git a/arch/arm64/kernel/vdso/vgettimeofday.c b/arch/arm64/kernel/vdso/vgettimeofday.c new file mode 100644 index 000000000000..747635501a14 --- /dev/null +++ b/arch/arm64/kernel/vdso/vgettimeofday.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 userspace implementations of gettimeofday() and similar. + * + * Copyright (C) 2018 ARM Limited + * + */ +#include <linux/time.h> +#include <linux/types.h> + +int __kernel_clock_gettime(clockid_t clock, + struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __kernel_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __kernel_clock_getres(clockid_t clock_id, + struct __kernel_timespec *res) +{ + return __cvdso_clock_getres(clock_id, res); +} diff --git a/arch/arm64/kernel/vdso32/.gitignore b/arch/arm64/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..4fea950fa5ed --- /dev/null +++ b/arch/arm64/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +vdso.lds +vdso.so.raw diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile new file mode 100644 index 000000000000..288c14d30b45 --- /dev/null +++ b/arch/arm64/kernel/vdso32/Makefile @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for vdso32 +# + +# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before +# the inclusion of generic Makefile. +ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32 +include $(srctree)/lib/vdso/Makefile + +COMPATCC := $(CROSS_COMPILE_COMPAT)gcc + +# Same as cc-*option, but using COMPATCC instead of CC +cc32-option = $(call try-run,\ + $(COMPATCC) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) +cc32-disable-warning = $(call try-run,\ + $(COMPATCC) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) +cc32-ldoption = $(call try-run,\ + $(COMPATCC) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2)) + +# We cannot use the global flags to compile the vDSO files, the main reason +# being that the 32-bit compiler may be older than the main (64-bit) compiler +# and therefore may not understand flags set using $(cc-option ...). Besides, +# arch-specific options should be taken from the arm Makefile instead of the +# arm64 one. +# As a result we set our own flags here. 
+ +# From top-level Makefile +# NOSTDINC_FLAGS +VDSO_CPPFLAGS := -nostdinc -isystem $(shell $(COMPATCC) -print-file-name=include) +VDSO_CPPFLAGS += $(LINUXINCLUDE) +VDSO_CPPFLAGS += $(KBUILD_CPPFLAGS) + +# Common C and assembly flags +# From top-level Makefile +VDSO_CAFLAGS := $(VDSO_CPPFLAGS) +VDSO_CAFLAGS += $(call cc32-option,-fno-PIE) +ifdef CONFIG_DEBUG_INFO +VDSO_CAFLAGS += -g +endif +ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(COMPATCC)), y) +VDSO_CAFLAGS += -DCC_HAVE_ASM_GOTO +endif + +# From arm Makefile +VDSO_CAFLAGS += $(call cc32-option,-fno-dwarf2-cfi-asm) +VDSO_CAFLAGS += -mabi=aapcs-linux -mfloat-abi=soft +ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) +VDSO_CAFLAGS += -mbig-endian +else +VDSO_CAFLAGS += -mlittle-endian +endif + +# From arm vDSO Makefile +VDSO_CAFLAGS += -fPIC -fno-builtin -fno-stack-protector +VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING + +# Try to compile for ARMv8. If the compiler is too old and doesn't support it, +# fall back to v7. There is no easy way to check for what architecture the code +# is being compiled, so define a macro specifying that (see arch/arm/Makefile). +VDSO_CAFLAGS += $(call cc32-option,-march=armv8-a -D__LINUX_ARM_ARCH__=8,\ + -march=armv7-a -D__LINUX_ARM_ARCH__=7) + +VDSO_CFLAGS := $(VDSO_CAFLAGS) +VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 +# KBUILD_CFLAGS from top-level Makefile +VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ + -fno-strict-aliasing -fno-common \ + -Werror-implicit-function-declaration \ + -Wno-format-security \ + -std=gnu89 +VDSO_CFLAGS += -O2 +# Some useful compiler-dependent flags from top-level Makefile +VDSO_CFLAGS += $(call cc32-option,-Wdeclaration-after-statement,) +VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) +VDSO_CFLAGS += $(call cc32-option,-fno-strict-overflow) +VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) +VDSO_CFLAGS += $(call cc32-option,-Werror=date-time) +VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) + +# The 32-bit compiler does not provide 128-bit integers, which are used in +# some headers that are indirectly included from the vDSO code. +# This hack makes the compiler happy and should trigger a warning/error if +# variables of such type are referenced. 
+VDSO_CFLAGS += -D__uint128_t='void*' +# Silence some warnings coming from headers that operate on long's +# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) +VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) +VDSO_CFLAGS += -Wno-int-to-pointer-cast + +VDSO_AFLAGS := $(VDSO_CAFLAGS) +VDSO_AFLAGS += -D__ASSEMBLY__ + +VDSO_LDFLAGS := $(VDSO_CPPFLAGS) +# From arm vDSO Makefile +VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 +VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 +VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft +VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--hash-style=sysv) +VDSO_LDFLAGS += $(call cc32-ldoption,-Wl$(comma)--build-id) +VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd) + + +# Borrow vdsomunge.c from the arm vDSO +# We have to use a relative path because scripts/Makefile.host prefixes +# $(hostprogs-y) with $(obj) +munge := ../../../arm/vdso/vdsomunge +hostprogs-y := $(munge) + +c-obj-vdso := note.o +c-obj-vdso-gettimeofday := vgettimeofday.o +asm-obj-vdso := sigreturn.o + +ifneq ($(c-gettimeofday-y),) +VDSO_CFLAGS_gettimeofday_o += -include $(c-gettimeofday-y) +endif + +VDSO_CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os + +# Build rules +targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso.so.dbg vdso.so.raw +c-obj-vdso := $(addprefix $(obj)/, $(c-obj-vdso)) +c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) +asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) +obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) + +obj-y += vdso.o +extra-y += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +# Force dependency (vdso.s includes vdso.so through incbin) +$(obj)/vdso.o: $(obj)/vdso.so + +include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE + $(call if_changed,vdsosym) + +# Strip rule for vdso.so +$(obj)/vdso.so: OBJCOPYFLAGS := -S +$(obj)/vdso.so: $(obj)/vdso.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE + $(call if_changed,vdsomunge) + +# Link rule for the .so file, .lds has to be first +$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,vdsold) + $(call if_changed,vdso_check) + +# Compilation rules for the vDSO sources +$(c-obj-vdso): %.o: %.c FORCE + $(call if_changed_dep,vdsocc) +$(c-obj-vdso-gettimeofday): %.o: %.c FORCE + $(call if_changed_dep,vdsocc_gettimeofday) +$(asm-obj-vdso): %.o: %.S FORCE + $(call if_changed_dep,vdsoas) + +# Actual build commands +quiet_cmd_vdsold = VDSOL $@ + cmd_vdsold = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_LDFLAGS) \ + -Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@ +quiet_cmd_vdsocc = VDSOC $@ + cmd_vdsocc = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $< +quiet_cmd_vdsocc_gettimeofday = VDSOC_GTD $@ + cmd_vdsocc_gettimeofday = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) $(VDSO_CFLAGS_gettimeofday_o) -c -o $@ $< +quiet_cmd_vdsoas = VDSOA $@ + cmd_vdsoas = $(COMPATCC) -Wp,-MD,$(depfile) $(VDSO_AFLAGS) -c -o $@ $< + +quiet_cmd_vdsomunge = MUNGE $@ + cmd_vdsomunge = $(obj)/$(munge) $< $@ + +# Generate vDSO offsets using helper script (borrowed from the 64-bit vDSO) +gen-vdsosym := $(srctree)/$(src)/../vdso/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ +# The AArch64 nm should be able to read an AArch32 binary + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +# Install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL 
$@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/vdso32.so + +vdso.so: $(obj)/vdso.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso.so diff --git a/arch/arm64/kernel/vdso32/note.c b/arch/arm64/kernel/vdso32/note.c new file mode 100644 index 000000000000..eff5bf9efb8b --- /dev/null +++ b/arch/arm64/kernel/vdso32/note.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012-2018 ARM Limited + * + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> +#include <linux/build-salt.h> + +ELFNOTE32("Linux", 0, LINUX_VERSION_CODE); +BUILD_SALT; diff --git a/arch/arm64/kernel/vdso32/sigreturn.S b/arch/arm64/kernel/vdso32/sigreturn.S new file mode 100644 index 000000000000..1a81277c2d09 --- /dev/null +++ b/arch/arm64/kernel/vdso32/sigreturn.S @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file provides both A32 and T32 versions, in accordance with the + * arm sigreturn code. + * + * Copyright (C) 2018 ARM Limited + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> + +#define ARM_ENTRY(name) \ + ENTRY(name) + +#define ARM_ENDPROC(name) \ + .type name, %function; \ + END(name) + + .text + + .arm + .fnstart + .save {r0-r15} + .pad #COMPAT_SIGFRAME_REGS_OFFSET + nop +ARM_ENTRY(__kernel_sigreturn_arm) + mov r7, #__NR_compat_sigreturn + svc #0 + .fnend +ARM_ENDPROC(__kernel_sigreturn_arm) + + .fnstart + .save {r0-r15} + .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET + nop +ARM_ENTRY(__kernel_rt_sigreturn_arm) + mov r7, #__NR_compat_rt_sigreturn + svc #0 + .fnend +ARM_ENDPROC(__kernel_rt_sigreturn_arm) + + .thumb + .fnstart + .save {r0-r15} + .pad #COMPAT_SIGFRAME_REGS_OFFSET + nop +ARM_ENTRY(__kernel_sigreturn_thumb) + mov r7, #__NR_compat_sigreturn + svc #0 + .fnend +ARM_ENDPROC(__kernel_sigreturn_thumb) + + .fnstart + .save {r0-r15} + .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET + nop +ARM_ENTRY(__kernel_rt_sigreturn_thumb) + mov r7, #__NR_compat_rt_sigreturn + svc #0 + .fnend +ARM_ENDPROC(__kernel_rt_sigreturn_thumb) diff --git a/arch/arm64/kernel/vdso32/vdso.S b/arch/arm64/kernel/vdso32/vdso.S new file mode 100644 index 000000000000..e72ac7bc4c04 --- /dev/null +++ b/arch/arm64/kernel/vdso32/vdso.S @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 ARM Limited + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/page.h> + + .globl vdso32_start, vdso32_end + .section .rodata + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/arm64/kernel/vdso32/vdso.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S new file mode 100644 index 000000000000..a3944927eaeb --- /dev/null +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Adapted from arm64 version. + * + * GNU linker script for the VDSO library. + * Heavily based on the vDSO linker scripts for other archs. + * + * Copyright (C) 2012-2018 ARM Limited + */ + +#include <linux/const.h> +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") +OUTPUT_ARCH(arm) + +SECTIONS +{ + PROVIDE_HIDDEN(_vdso_data = . - PAGE_SIZE); + . 
= VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + + .text : { *(.text*) } :text =0xe7f001f2 + + .got : { *(.got) } + .rel.plt : { *(.rel.plt) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ +} + +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_clock_getres; + __kernel_sigreturn_arm; + __kernel_sigreturn_thumb; + __kernel_rt_sigreturn_arm; + __kernel_rt_sigreturn_thumb; + __vdso_clock_gettime64; + local: *; + }; +} + +/* + * Make the sigreturn code visible to the kernel. + */ +VDSO_compat_sigreturn_arm = __kernel_sigreturn_arm; +VDSO_compat_sigreturn_thumb = __kernel_sigreturn_thumb; +VDSO_compat_rt_sigreturn_arm = __kernel_rt_sigreturn_arm; +VDSO_compat_rt_sigreturn_thumb = __kernel_rt_sigreturn_thumb; diff --git a/arch/arm64/kernel/vdso32/vgettimeofday.c b/arch/arm64/kernel/vdso32/vgettimeofday.c new file mode 100644 index 000000000000..54fc1c2ce93f --- /dev/null +++ b/arch/arm64/kernel/vdso32/vgettimeofday.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 compat userspace implementations of gettimeofday() and similar. 
+ * + * Copyright (C) 2018 ARM Limited + * + */ +#include <linux/time.h> +#include <linux/types.h> + +int __vdso_clock_gettime(clockid_t clock, + struct old_timespec32 *ts) +{ + /* The checks below are required for ABI consistency with arm */ + if ((u32)ts >= TASK_SIZE_32) + return -EFAULT; + + return __cvdso_clock_gettime32(clock, ts); +} + +int __vdso_clock_gettime64(clockid_t clock, + struct __kernel_timespec *ts) +{ + /* The checks below are required for ABI consistency with arm */ + if ((u32)ts >= TASK_SIZE_32) + return -EFAULT; + + return __cvdso_clock_gettime(clock, ts); +} + +int __vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __vdso_clock_getres(clockid_t clock_id, + struct old_timespec32 *res) +{ + /* The checks below are required for ABI consistency with arm */ + if ((u32)res >= TASK_SIZE_32) + return -EFAULT; + + return __cvdso_clock_getres_time32(clock_id, res); +} + +/* Avoid unresolved references emitted by GCC */ + +void __aeabi_unwind_cpp_pr0(void) +{ +} + +void __aeabi_unwind_cpp_pr1(void) +{ +} + +void __aeabi_unwind_cpp_pr2(void) +{ +} diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 6e3c9c8b2df9..525010504f9d 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -112,9 +112,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1]; - /* Clean guest FP state to memory and invalidate cpu view */ - fpsimd_save(); - fpsimd_flush_cpu_state(); + fpsimd_save_and_flush_cpu_state(); if (guest_has_sve) *guest_zcr = read_sysreg_s(SYS_ZCR_EL12); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index c2afa7982047..dfd626447482 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -208,7 +208,7 @@ out: #define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) #define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) -#define vq_present(vqs, vq) ((vqs)[vq_word(vq)] & vq_mask(vq)) +#define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq))) static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index bd34016354ba..e5cc8d66bf53 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -6,6 +6,7 @@ #include <linux/linkage.h> +#include <asm/alternative.h> #include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/fpsimdmacros.h> @@ -52,6 +53,20 @@ ENTRY(__guest_enter) // Store the host regs save_callee_saved_regs x1 + // Now the host state is stored if we have a pending RAS SError it must + // affect the host. If any asynchronous exception is pending we defer + // the guest entry. The DSB isn't necessary before v8.2 as any SError + // would be fatal. +alternative_if ARM64_HAS_RAS_EXTN + dsb nshst + isb +alternative_else_nop_endif + mrs x1, isr_el1 + cbz x1, 1f + mov x0, #ARM_EXCEPTION_IRQ + ret + +1: add x18, x0, #VCPU_CONTEXT // Macro ptrauth_switch_to_guest format: @@ -127,8 +142,8 @@ ENTRY(__guest_exit) alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error - // without an unmask-SError and isb. - esb + // without an unmask-SError and isb. The ESB-instruction consumed any + // pending guest error when we took the exception from the guest. 
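
The ESB trick above relies on the RAS extensions: an error synchronization barrier turns a pending SError into a syndrome recorded in DISR_EL1 instead of an asynchronous exception. In C-level terms, the consume step looks roughly like the sketch below; the real logic lives in the assembly here, and the kernel's esb macro expands to the hint #16 encoding so older assemblers still accept it:

static inline u64 consume_pending_serror(void)
{
	u64 disr;

	asm volatile("hint #16" : : : "memory");	/* esb */
	disr = read_sysreg_s(SYS_DISR_EL1);
	if (disr)					/* non-zero: an SError was pending */
		write_sysreg_s(0, SYS_DISR_EL1);	/* acknowledge it */
	return disr;
}
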
mrs_s x2, SYS_DISR_EL1 str x2, [x1, #(VCPU_FAULT_DISR - VCPU_CONTEXT)] cbz x2, 1f @@ -136,8 +151,16 @@ alternative_if ARM64_HAS_RAS_EXTN orr x0, x0, #(1<<ARM_EXIT_WITH_SERROR_BIT) 1: ret alternative_else - // If we have a pending asynchronous abort, now is the - // time to find out. From your VAXorcist book, page 666: + dsb sy // Synchronize against in-flight ld/st + isb // Prevent an early read of side-effect free ISR + mrs x2, isr_el1 + tbnz x2, #8, 2f // ISR_EL1.A + ret + nop +2: +alternative_endif + // We know we have a pending asynchronous abort, now is the + // time to flush it out. From your VAXorcist book, page 666: // "Threaten me not, oh Evil one! For I speak with // the power of DEC, and I command thee to show thyself!" mrs x2, elr_el2 @@ -145,10 +168,7 @@ alternative_else mrs x4, spsr_el2 mov x5, x0 - dsb sy // Synchronize against in-flight ld/st - nop msr daifclr, #4 // Unmask aborts -alternative_endif // This is our single instruction exception window. A pending // SError is guaranteed to occur at the earliest when we unmask @@ -161,6 +181,8 @@ abort_guest_exit_start: .global abort_guest_exit_end abort_guest_exit_end: + msr daifset, #4 // Mask aborts + // If the exception took place, restore the EL1 exception // context so that we can report some information. // Merge the exception code with the SError pending bit. diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index b8e045615961..ffa68d5713f1 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -216,17 +216,34 @@ ENDPROC(\label) .align 11 +.macro check_preamble_length start, end +/* kvm_patch_vector_branch() generates code that jumps over the preamble. */ +.if ((\end-\start) != KVM_VECTOR_PREAMBLE) + .error "KVM vector preamble length mismatch" +.endif +.endm + .macro valid_vect target .align 7 +661: + esb stp x0, x1, [sp, #-16]! +662: b \target + +check_preamble_length 661b, 662b .endm .macro invalid_vect target .align 7 +661: b \target + nop +662: ldp x0, x1, [sp], #16 b \target + +check_preamble_length 661b, 662b .endm ENTRY(__kvm_hyp_vector) @@ -254,13 +271,14 @@ ENDPROC(__kvm_hyp_vector) #ifdef CONFIG_KVM_INDIRECT_VECTORS .macro hyp_ventry .align 7 -1: .rept 27 +1: esb + .rept 26 nop .endr /* * The default sequence is to directly branch to the KVM vectors, * using the computed offset. This applies for VHE as well as - * !ARM64_HARDEN_EL2_VECTORS. + * !ARM64_HARDEN_EL2_VECTORS. The first vector must always run the preamble. * * For ARM64_HARDEN_EL2_VECTORS configurations, this gets replaced * with: @@ -271,12 +289,13 @@ ENDPROC(__kvm_hyp_vector) * movk x0, #((addr >> 32) & 0xffff), lsl #32 * br x0 * - * Where addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + 4. + * Where: + * addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + KVM_VECTOR_PREAMBLE. * See kvm_patch_vector_branch for details. */ alternative_cb kvm_patch_vector_branch - b __kvm_hyp_vector + (1b - 0b) - nop + stp x0, x1, [sp, #-16]! 
+ b __kvm_hyp_vector + (1b - 0b + KVM_VECTOR_PREAMBLE) nop nop nop @@ -301,6 +320,7 @@ ENTRY(__bp_harden_hyp_vecs_end) .popsection ENTRY(__smccc_workaround_1_smc_start) + esb sub sp, sp, #(8 * 4) stp x2, x3, [sp, #(8 * 0)] stp x0, x1, [sp, #(8 * 2)] diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index b0041812bca9..adaf266d8de8 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -284,7 +284,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) return true; - far = read_sysreg_el2(far); + far = read_sysreg_el2(SYS_FAR); /* * The HPFAR can be invalid if the stage 2 fault did not @@ -401,7 +401,7 @@ static bool __hyp_text __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) - vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr); + vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); /* * We're using the raw exception code in order to only process @@ -604,7 +604,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) * Naturally, we want to avoid this. */ if (system_uses_irq_prio_masking()) { - gic_write_pmr(GIC_PRIO_IRQON); + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); dsb(sy); } @@ -697,8 +697,8 @@ static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par, asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va)); __hyp_do_panic(str_va, - spsr, elr, - read_sysreg(esr_el2), read_sysreg_el2(far), + spsr, elr, + read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR), read_sysreg(hpfar_el2), par, vcpu); } @@ -713,15 +713,15 @@ static void __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par, panic(__hyp_panic_string, spsr, elr, - read_sysreg_el2(esr), read_sysreg_el2(far), + read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR), read_sysreg(hpfar_el2), par, vcpu); } NOKPROBE_SYMBOL(__hyp_call_panic_vhe); void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) { - u64 spsr = read_sysreg_el2(spsr); - u64 elr = read_sysreg_el2(elr); + u64 spsr = read_sysreg_el2(SYS_SPSR); + u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); if (!has_vhe()) diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index c283f7cbc702..7ddbc849b580 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -43,33 +43,33 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); - ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr); + ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(SYS_SCTLR); ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); - ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(cpacr); - ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(ttbr0); - ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(ttbr1); - ctxt->sys_regs[TCR_EL1] = read_sysreg_el1(tcr); - ctxt->sys_regs[ESR_EL1] = read_sysreg_el1(esr); - ctxt->sys_regs[AFSR0_EL1] = read_sysreg_el1(afsr0); - ctxt->sys_regs[AFSR1_EL1] = read_sysreg_el1(afsr1); - ctxt->sys_regs[FAR_EL1] = read_sysreg_el1(far); - ctxt->sys_regs[MAIR_EL1] = read_sysreg_el1(mair); - ctxt->sys_regs[VBAR_EL1] = read_sysreg_el1(vbar); - ctxt->sys_regs[CONTEXTIDR_EL1] = read_sysreg_el1(contextidr); - ctxt->sys_regs[AMAIR_EL1] = read_sysreg_el1(amair); - ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg_el1(cntkctl); + 
ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(SYS_CPACR); + ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(SYS_TTBR0); + ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(SYS_TTBR1); + ctxt->sys_regs[TCR_EL1] = read_sysreg_el1(SYS_TCR); + ctxt->sys_regs[ESR_EL1] = read_sysreg_el1(SYS_ESR); + ctxt->sys_regs[AFSR0_EL1] = read_sysreg_el1(SYS_AFSR0); + ctxt->sys_regs[AFSR1_EL1] = read_sysreg_el1(SYS_AFSR1); + ctxt->sys_regs[FAR_EL1] = read_sysreg_el1(SYS_FAR); + ctxt->sys_regs[MAIR_EL1] = read_sysreg_el1(SYS_MAIR); + ctxt->sys_regs[VBAR_EL1] = read_sysreg_el1(SYS_VBAR); + ctxt->sys_regs[CONTEXTIDR_EL1] = read_sysreg_el1(SYS_CONTEXTIDR); + ctxt->sys_regs[AMAIR_EL1] = read_sysreg_el1(SYS_AMAIR); + ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg_el1(SYS_CNTKCTL); ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1); ctxt->sys_regs[TPIDR_EL1] = read_sysreg(tpidr_el1); ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1); - ctxt->gp_regs.elr_el1 = read_sysreg_el1(elr); - ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr); + ctxt->gp_regs.elr_el1 = read_sysreg_el1(SYS_ELR); + ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR); } static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) { - ctxt->gp_regs.regs.pc = read_sysreg_el2(elr); - ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr); + ctxt->gp_regs.regs.pc = read_sysreg_el2(SYS_ELR); + ctxt->gp_regs.regs.pstate = read_sysreg_el2(SYS_SPSR); if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) ctxt->sys_regs[DISR_EL1] = read_sysreg_s(SYS_VDISR_EL2); @@ -109,35 +109,35 @@ static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctx static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) { - write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0); - write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); + write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0); + write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0); } static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); - write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], sctlr); - write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); - write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], cpacr); - write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1], ttbr0); - write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1], ttbr1); - write_sysreg_el1(ctxt->sys_regs[TCR_EL1], tcr); - write_sysreg_el1(ctxt->sys_regs[ESR_EL1], esr); - write_sysreg_el1(ctxt->sys_regs[AFSR0_EL1], afsr0); - write_sysreg_el1(ctxt->sys_regs[AFSR1_EL1], afsr1); - write_sysreg_el1(ctxt->sys_regs[FAR_EL1], far); - write_sysreg_el1(ctxt->sys_regs[MAIR_EL1], mair); - write_sysreg_el1(ctxt->sys_regs[VBAR_EL1], vbar); - write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL1],contextidr); - write_sysreg_el1(ctxt->sys_regs[AMAIR_EL1], amair); - write_sysreg_el1(ctxt->sys_regs[CNTKCTL_EL1], cntkctl); + write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR); + write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1); + write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], SYS_CPACR); + write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1], SYS_TTBR0); + write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1], SYS_TTBR1); + write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR); + write_sysreg_el1(ctxt->sys_regs[ESR_EL1], SYS_ESR); + write_sysreg_el1(ctxt->sys_regs[AFSR0_EL1], SYS_AFSR0); + write_sysreg_el1(ctxt->sys_regs[AFSR1_EL1], SYS_AFSR1); + write_sysreg_el1(ctxt->sys_regs[FAR_EL1], SYS_FAR); + write_sysreg_el1(ctxt->sys_regs[MAIR_EL1], SYS_MAIR); + 
write_sysreg_el1(ctxt->sys_regs[VBAR_EL1], SYS_VBAR); + write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL1],SYS_CONTEXTIDR); + write_sysreg_el1(ctxt->sys_regs[AMAIR_EL1], SYS_AMAIR); + write_sysreg_el1(ctxt->sys_regs[CNTKCTL_EL1], SYS_CNTKCTL); write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1); write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1); write_sysreg(ctxt->gp_regs.sp_el1, sp_el1); - write_sysreg_el1(ctxt->gp_regs.elr_el1, elr); - write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr); + write_sysreg_el1(ctxt->gp_regs.elr_el1, SYS_ELR); + write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],SYS_SPSR); } static void __hyp_text @@ -160,8 +160,8 @@ __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) pstate = PSR_MODE_EL2h | PSR_IL_BIT; - write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); - write_sysreg_el2(pstate, spsr); + write_sysreg_el2(ctxt->gp_regs.regs.pc, SYS_ELR); + write_sysreg_el2(pstate, SYS_SPSR); if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2); diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 32078b767f63..d49a14497715 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -33,12 +33,12 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm, * in the TCR_EL1 register. We also need to prevent it to * allocate IPA->PA walks, so we enable the S1 MMU... */ - val = cxt->tcr = read_sysreg_el1(tcr); + val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; - write_sysreg_el1(val, tcr); - val = cxt->sctlr = read_sysreg_el1(sctlr); + write_sysreg_el1(val, SYS_TCR); + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); val |= SCTLR_ELx_M; - write_sysreg_el1(val, sctlr); + write_sysreg_el1(val, SYS_SCTLR); } /* @@ -85,8 +85,8 @@ static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm, if (cpus_have_const_cap(ARM64_WORKAROUND_1165522)) { /* Restore the registers to what they were */ - write_sysreg_el1(cxt->tcr, tcr); - write_sysreg_el1(cxt->sctlr, sctlr); + write_sysreg_el1(cxt->tcr, SYS_TCR); + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); } local_irq_restore(cxt->flags); diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index ba2aaeb84c6c..29ee1feba4eb 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -16,7 +16,7 @@ static bool __hyp_text __is_be(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) - return !!(read_sysreg_el2(spsr) & PSR_AA32_E_BIT); + return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT); return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE); } diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c index d66613e6ad08..0d60e4f0af66 100644 --- a/arch/arm64/kvm/regmap.c +++ b/arch/arm64/kvm/regmap.c @@ -152,7 +152,7 @@ unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu) switch (spsr_idx) { case KVM_SPSR_SVC: - return read_sysreg_el1(spsr); + return read_sysreg_el1(SYS_SPSR); case KVM_SPSR_ABT: return read_sysreg(spsr_abt); case KVM_SPSR_UND: @@ -177,7 +177,7 @@ void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v) switch (spsr_idx) { case KVM_SPSR_SVC: - write_sysreg_el1(v, spsr); + write_sysreg_el1(v, SYS_SPSR); case KVM_SPSR_ABT: write_sysreg(v, spsr_abt); case KVM_SPSR_UND: diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ce933f296049..f26e181d881c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -81,24 +81,24 @@ u64 
vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) */ switch (reg) { case CSSELR_EL1: return read_sysreg_s(SYS_CSSELR_EL1); - case SCTLR_EL1: return read_sysreg_s(sctlr_EL12); + case SCTLR_EL1: return read_sysreg_s(SYS_SCTLR_EL12); case ACTLR_EL1: return read_sysreg_s(SYS_ACTLR_EL1); - case CPACR_EL1: return read_sysreg_s(cpacr_EL12); - case TTBR0_EL1: return read_sysreg_s(ttbr0_EL12); - case TTBR1_EL1: return read_sysreg_s(ttbr1_EL12); - case TCR_EL1: return read_sysreg_s(tcr_EL12); - case ESR_EL1: return read_sysreg_s(esr_EL12); - case AFSR0_EL1: return read_sysreg_s(afsr0_EL12); - case AFSR1_EL1: return read_sysreg_s(afsr1_EL12); - case FAR_EL1: return read_sysreg_s(far_EL12); - case MAIR_EL1: return read_sysreg_s(mair_EL12); - case VBAR_EL1: return read_sysreg_s(vbar_EL12); - case CONTEXTIDR_EL1: return read_sysreg_s(contextidr_EL12); + case CPACR_EL1: return read_sysreg_s(SYS_CPACR_EL12); + case TTBR0_EL1: return read_sysreg_s(SYS_TTBR0_EL12); + case TTBR1_EL1: return read_sysreg_s(SYS_TTBR1_EL12); + case TCR_EL1: return read_sysreg_s(SYS_TCR_EL12); + case ESR_EL1: return read_sysreg_s(SYS_ESR_EL12); + case AFSR0_EL1: return read_sysreg_s(SYS_AFSR0_EL12); + case AFSR1_EL1: return read_sysreg_s(SYS_AFSR1_EL12); + case FAR_EL1: return read_sysreg_s(SYS_FAR_EL12); + case MAIR_EL1: return read_sysreg_s(SYS_MAIR_EL12); + case VBAR_EL1: return read_sysreg_s(SYS_VBAR_EL12); + case CONTEXTIDR_EL1: return read_sysreg_s(SYS_CONTEXTIDR_EL12); case TPIDR_EL0: return read_sysreg_s(SYS_TPIDR_EL0); case TPIDRRO_EL0: return read_sysreg_s(SYS_TPIDRRO_EL0); case TPIDR_EL1: return read_sysreg_s(SYS_TPIDR_EL1); - case AMAIR_EL1: return read_sysreg_s(amair_EL12); - case CNTKCTL_EL1: return read_sysreg_s(cntkctl_EL12); + case AMAIR_EL1: return read_sysreg_s(SYS_AMAIR_EL12); + case CNTKCTL_EL1: return read_sysreg_s(SYS_CNTKCTL_EL12); case PAR_EL1: return read_sysreg_s(SYS_PAR_EL1); case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2); case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2); @@ -124,24 +124,24 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) */ switch (reg) { case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return; - case SCTLR_EL1: write_sysreg_s(val, sctlr_EL12); return; + case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); return; case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); return; - case CPACR_EL1: write_sysreg_s(val, cpacr_EL12); return; - case TTBR0_EL1: write_sysreg_s(val, ttbr0_EL12); return; - case TTBR1_EL1: write_sysreg_s(val, ttbr1_EL12); return; - case TCR_EL1: write_sysreg_s(val, tcr_EL12); return; - case ESR_EL1: write_sysreg_s(val, esr_EL12); return; - case AFSR0_EL1: write_sysreg_s(val, afsr0_EL12); return; - case AFSR1_EL1: write_sysreg_s(val, afsr1_EL12); return; - case FAR_EL1: write_sysreg_s(val, far_EL12); return; - case MAIR_EL1: write_sysreg_s(val, mair_EL12); return; - case VBAR_EL1: write_sysreg_s(val, vbar_EL12); return; - case CONTEXTIDR_EL1: write_sysreg_s(val, contextidr_EL12); return; + case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); return; + case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); return; + case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); return; + case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); return; + case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); return; + case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); return; + case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); return; + case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); return; + case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); 
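/*
 * Illustration, not part of the patch: with VHE, the vcpu's EL1 state can
 * stay loaded in hardware while the host runs at EL2, so host accesses to
 * the guest's EL1 registers go through the *_EL12 aliases used in the
 * switch above. A minimal sketch, assuming the usual sysregs_loaded_on_cpu
 * bookkeeping:
 */
static u64 demo_read_guest_sctlr(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.sysregs_loaded_on_cpu)		/* live in hardware */
		return read_sysreg_s(SYS_SCTLR_EL12);
	return __vcpu_sys_reg(vcpu, SCTLR_EL1);		/* saved in memory */
}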
return; + case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); return; + case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12); return; case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); return; case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); return; case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); return; - case AMAIR_EL1: write_sysreg_s(val, amair_EL12); return; - case CNTKCTL_EL1: write_sysreg_s(val, cntkctl_EL12); return; + case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); return; + case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); return; case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); return; case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return; case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return; @@ -865,12 +865,12 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->Op2 & 0x1) { /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; - kvm_pmu_enable_counter(vcpu, val); + kvm_pmu_enable_counter_mask(vcpu, val); kvm_vcpu_pmu_restore_guest(vcpu); } else { /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; - kvm_pmu_disable_counter(vcpu, val); + kvm_pmu_disable_counter_mask(vcpu, val); } } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask; diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 2947ab1b0fa5..acd8084f1f2c 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -170,11 +170,10 @@ void kvm_patch_vector_branch(struct alt_instr *alt, addr |= ((u64)origptr & GENMASK_ULL(10, 7)); /* - * Branch to the second instruction in the vectors in order to - * avoid the initial store on the stack (which we already - * perform in the hardening vectors). + * Branch over the preamble in order to avoid the initial store on + * the stack (which we already perform in the hardening vectors). */ - addr += AARCH64_INSN_SIZE; + addr += KVM_VECTOR_PREAMBLE; /* stp x0, x1, [sp, #-16]! */ insn = aarch64_insn_gen_load_store_pair(AARCH64_INSN_REG_0, diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 5992eb9a9a08..1d3f0b5a9940 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -1,24 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * SWIOTLB-based DMA API implementation - * * Copyright (C) 2012 ARM Ltd. 
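/*
 * Illustration, not part of the patch: the _mask rename above reflects
 * that the PMU helpers take a bitmap of counters rather than a single
 * index. A minimal sketch of walking such a mask with for_each_set_bit():
 */
static void demo_walk_counter_mask(u64 val)
{
	unsigned long mask = val;
	int i;

	for_each_set_bit(i, &mask, ARMV8_PMU_MAX_COUNTERS)
		pr_info("counter %d selected\n", i);
}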
* Author: Catalin Marinas <catalin.marinas@arm.com> */ #include <linux/gfp.h> -#include <linux/acpi.h> -#include <linux/memblock.h> #include <linux/cache.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/genalloc.h> -#include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> -#include <linux/vmalloc.h> -#include <linux/swiotlb.h> -#include <linux/pci.h> +#include <linux/dma-iommu.h> #include <asm/cacheflush.h> @@ -47,422 +36,33 @@ void arch_dma_prep_coherent(struct page *page, size_t size) __dma_flush_area(page_address(page), size); } -#ifdef CONFIG_IOMMU_DMA -static int __swiotlb_get_sgtable_page(struct sg_table *sgt, - struct page *page, size_t size) -{ - int ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - - if (!ret) - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - - return ret; -} - -static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, - unsigned long pfn, size_t size) -{ - int ret = -ENXIO; - unsigned long nr_vma_pages = vma_pages(vma); - unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; - - if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) { - ret = remap_pfn_range(vma, vma->vm_start, - pfn + off, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); - } - - return ret; -} -#endif /* CONFIG_IOMMU_DMA */ - static int __init arm64_dma_init(void) { - WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(), - TAINT_CPU_OUT_OF_SPEC, - "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)", - ARCH_DMA_MINALIGN, cache_line_size()); return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC)); } arch_initcall(arm64_dma_init); #ifdef CONFIG_IOMMU_DMA -#include <linux/dma-iommu.h> -#include <linux/platform_device.h> -#include <linux/amba/bus.h> - -/* Thankfully, all cache ops are by VA so we can ignore phys here */ -static void flush_page(struct device *dev, const void *virt, phys_addr_t phys) -{ - __dma_flush_area(virt, PAGE_SIZE); -} - -static void *__iommu_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp, - unsigned long attrs) -{ - bool coherent = dev_is_dma_coherent(dev); - int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs); - size_t iosize = size; - void *addr; - - if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n")) - return NULL; - - size = PAGE_ALIGN(size); - - /* - * Some drivers rely on this, and we probably don't want the - * possibility of stale kernel data being read by devices anyway. - */ - gfp |= __GFP_ZERO; - - if (!gfpflags_allow_blocking(gfp)) { - struct page *page; - /* - * In atomic context we can't remap anything, so we'll only - * get the virtually contiguous buffer we need by way of a - * physically contiguous allocation. - */ - if (coherent) { - page = alloc_pages(gfp, get_order(size)); - addr = page ? 
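/*
 * Illustration, not part of the patch: arch_dma_prep_coherent() (kept
 * above) flushes a freshly allocated buffer so no dirty cache lines can
 * later overwrite data DMA'd by a non-coherent device. A minimal sketch
 * of handing out one such page (the GFP choice here is an assumption):
 */
static void *demo_grab_coherent_page(void)
{
	struct page *page = alloc_page(GFP_DMA32 | __GFP_ZERO);

	if (!page)
		return NULL;
	arch_dma_prep_coherent(page, PAGE_SIZE);	/* flush to point of coherency */
	return page_address(page);
}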
page_address(page) : NULL; - } else { - addr = dma_alloc_from_pool(size, &page, gfp); - } - if (!addr) - return NULL; - - *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot); - if (*handle == DMA_MAPPING_ERROR) { - if (coherent) - __free_pages(page, get_order(size)); - else - dma_free_from_pool(addr, size); - addr = NULL; - } - } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { - pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs); - struct page *page; - - page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), gfp & __GFP_NOWARN); - if (!page) - return NULL; - - *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot); - if (*handle == DMA_MAPPING_ERROR) { - dma_release_from_contiguous(dev, page, - size >> PAGE_SHIFT); - return NULL; - } - addr = dma_common_contiguous_remap(page, size, VM_USERMAP, - prot, - __builtin_return_address(0)); - if (addr) { - if (!coherent) - __dma_flush_area(page_to_virt(page), iosize); - memset(addr, 0, size); - } else { - iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs); - dma_release_from_contiguous(dev, page, - size >> PAGE_SHIFT); - } - } else { - pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs); - struct page **pages; - - pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot, - handle, flush_page); - if (!pages) - return NULL; - - addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot, - __builtin_return_address(0)); - if (!addr) - iommu_dma_free(dev, pages, iosize, handle); - } - return addr; -} - -static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t handle, unsigned long attrs) -{ - size_t iosize = size; - - size = PAGE_ALIGN(size); - /* - * @cpu_addr will be one of 4 things depending on how it was allocated: - * - A remapped array of pages for contiguous allocations. - * - A remapped array of pages from iommu_dma_alloc(), for all - * non-atomic allocations. - * - A non-cacheable alias from the atomic pool, for atomic - * allocations by non-coherent devices. - * - A normal lowmem address, for atomic allocations by - * coherent devices. - * Hence how dodgy the below logic looks... 
- */ - if (dma_in_atomic_pool(cpu_addr, size)) { - iommu_dma_unmap_page(dev, handle, iosize, 0, 0); - dma_free_from_pool(cpu_addr, size); - } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { - struct page *page = vmalloc_to_page(cpu_addr); - - iommu_dma_unmap_page(dev, handle, iosize, 0, attrs); - dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); - dma_common_free_remap(cpu_addr, size, VM_USERMAP); - } else if (is_vmalloc_addr(cpu_addr)){ - struct vm_struct *area = find_vm_area(cpu_addr); - - if (WARN_ON(!area || !area->pages)) - return; - iommu_dma_free(dev, area->pages, iosize, &handle); - dma_common_free_remap(cpu_addr, size, VM_USERMAP); - } else { - iommu_dma_unmap_page(dev, handle, iosize, 0, 0); - __free_pages(virt_to_page(cpu_addr), get_order(size)); - } -} - -static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - struct vm_struct *area; - int ret; - - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); - - if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - - if (!is_vmalloc_addr(cpu_addr)) { - unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr)); - return __swiotlb_mmap_pfn(vma, pfn, size); - } - - if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { - /* - * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped, - * hence in the vmalloc space. - */ - unsigned long pfn = vmalloc_to_pfn(cpu_addr); - return __swiotlb_mmap_pfn(vma, pfn, size); - } - - area = find_vm_area(cpu_addr); - if (WARN_ON(!area || !area->pages)) - return -ENXIO; - - return iommu_dma_mmap(area->pages, size, vma); -} - -static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size, unsigned long attrs) -{ - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - struct vm_struct *area = find_vm_area(cpu_addr); - - if (!is_vmalloc_addr(cpu_addr)) { - struct page *page = virt_to_page(cpu_addr); - return __swiotlb_get_sgtable_page(sgt, page, size); - } - - if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { - /* - * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped, - * hence in the vmalloc space. 
- */ - struct page *page = vmalloc_to_page(cpu_addr); - return __swiotlb_get_sgtable_page(sgt, page, size); - } - - if (WARN_ON(!area || !area->pages)) - return -ENXIO; - - return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size, - GFP_KERNEL); -} - -static void __iommu_sync_single_for_cpu(struct device *dev, - dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir) -{ - phys_addr_t phys; - - if (dev_is_dma_coherent(dev)) - return; - - phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dev_addr); - arch_sync_dma_for_cpu(dev, phys, size, dir); -} - -static void __iommu_sync_single_for_device(struct device *dev, - dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir) -{ - phys_addr_t phys; - - if (dev_is_dma_coherent(dev)) - return; - - phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dev_addr); - arch_sync_dma_for_device(dev, phys, size, dir); -} - -static dma_addr_t __iommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - bool coherent = dev_is_dma_coherent(dev); - int prot = dma_info_to_prot(dir, coherent, attrs); - dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot); - - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - dev_addr != DMA_MAPPING_ERROR) - __dma_map_area(page_address(page) + offset, size, dir); - - return dev_addr; -} - -static void __iommu_unmap_page(struct device *dev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) - __iommu_sync_single_for_cpu(dev, dev_addr, size, dir); - - iommu_dma_unmap_page(dev, dev_addr, size, dir, attrs); -} - -static void __iommu_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - if (dev_is_dma_coherent(dev)) - return; - - for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); -} - -static void __iommu_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - if (dev_is_dma_coherent(dev)) - return; - - for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); -} - -static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - bool coherent = dev_is_dma_coherent(dev); - - if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) - __iommu_sync_sg_for_device(dev, sgl, nelems, dir); - - return iommu_dma_map_sg(dev, sgl, nelems, - dma_info_to_prot(dir, coherent, attrs)); -} - -static void __iommu_unmap_sg_attrs(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, - unsigned long attrs) -{ - if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) - __iommu_sync_sg_for_cpu(dev, sgl, nelems, dir); - - iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs); -} - -static const struct dma_map_ops iommu_dma_ops = { - .alloc = __iommu_alloc_attrs, - .free = __iommu_free_attrs, - .mmap = __iommu_mmap_attrs, - .get_sgtable = __iommu_get_sgtable, - .map_page = __iommu_map_page, - .unmap_page = __iommu_unmap_page, - .map_sg = __iommu_map_sg_attrs, - .unmap_sg = __iommu_unmap_sg_attrs, - .sync_single_for_cpu = __iommu_sync_single_for_cpu, - .sync_single_for_device = __iommu_sync_single_for_device, - .sync_sg_for_cpu = __iommu_sync_sg_for_cpu, - .sync_sg_for_device = 
__iommu_sync_sg_for_device, - .map_resource = iommu_dma_map_resource, - .unmap_resource = iommu_dma_unmap_resource, -}; - -static int __init __iommu_dma_init(void) -{ - return iommu_dma_init(); -} -arch_initcall(__iommu_dma_init); - -static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *ops) -{ - struct iommu_domain *domain; - - if (!ops) - return; - - /* - * The IOMMU core code allocates the default DMA domain, which the - * underlying IOMMU driver needs to support via the dma-iommu layer. - */ - domain = iommu_get_domain_for_dev(dev); - - if (!domain) - goto out_err; - - if (domain->type == IOMMU_DOMAIN_DMA) { - if (iommu_dma_init_domain(domain, dma_base, size, dev)) - goto out_err; - - dev->dma_ops = &iommu_dma_ops; - } - - return; - -out_err: - pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", - dev_name(dev)); -} - void arch_teardown_dma_ops(struct device *dev) { dev->dma_ops = NULL; } - -#else - -static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu) -{ } - -#endif /* CONFIG_IOMMU_DMA */ +#endif void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { + int cls = cache_line_size_of_cpu(); + + WARN_TAINT(!coherent && cls > ARCH_DMA_MINALIGN, + TAINT_CPU_OUT_OF_SPEC, + "%s %s: ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)", + dev_driver_string(dev), dev_name(dev), + ARCH_DMA_MINALIGN, cls); + dev->dma_coherent = coherent; - __iommu_setup_dma_ops(dev, dma_base, size, iommu); + if (iommu) + iommu_setup_dma_ops(dev, dma_base, size); #ifdef CONFIG_XEN if (xen_initial_domain()) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2d115016feb4..c8c61b1eb479 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -384,40 +384,31 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADACCESS 0x020000 static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, - unsigned int mm_flags, unsigned long vm_flags, - struct task_struct *tsk) + unsigned int mm_flags, unsigned long vm_flags) { - struct vm_area_struct *vma; - vm_fault_t fault; + struct vm_area_struct *vma = find_vma(mm, addr); - vma = find_vma(mm, addr); - fault = VM_FAULT_BADMAP; if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > addr)) - goto check_stack; + return VM_FAULT_BADMAP; /* * Ok, we have a good vm_area for this memory access, so we can handle * it. */ -good_area: + if (unlikely(vma->vm_start > addr)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + return VM_FAULT_BADMAP; + if (expand_stack(vma, addr)) + return VM_FAULT_BADMAP; + } + /* * Check that the permissions on the VMA allow for the fault which * occurred. */ - if (!(vma->vm_flags & vm_flags)) { - fault = VM_FAULT_BADACCESS; - goto out; - } - + if (!(vma->vm_flags & vm_flags)) + return VM_FAULT_BADACCESS; return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); - -check_stack: - if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) - goto good_area; -out: - return fault; } static bool is_el0_instruction_abort(unsigned int esr) @@ -425,12 +416,20 @@ static bool is_el0_instruction_abort(unsigned int esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; } +/* + * Note: not valid for EL1 DC IVAC, but we never use that such that it + * should fault. EL0 cannot issue DC IVAC (undef). 
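/*
 * Illustration, not part of the patch: the decode below written out with
 * the ISS bit positions per the ARM ARM. The DEMO_ names are
 * hypothetical; the kernel uses ESR_ELx_WNR and ESR_ELx_CM for the same
 * bits.
 */
#define DEMO_ESR_WNR	(1u << 6)	/* Write-not-Read */
#define DEMO_ESR_CM	(1u << 8)	/* Cache Maintenance operation */

static bool demo_is_write_abort(unsigned int esr)
{
	/* a CM "write" is a cache op, not a data write: don't demand VM_WRITE */
	return (esr & DEMO_ESR_WNR) && !(esr & DEMO_ESR_CM);
}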
+ */ +static bool is_write_abort(unsigned int esr) +{ + return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; - struct task_struct *tsk; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; vm_fault_t fault, major = 0; unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@ -438,9 +437,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (notify_page_fault(regs, esr)) return 0; - tsk = current; - mm = tsk->mm; - /* * If we're in an interrupt or have no user context, we must not take * the fault. @@ -453,7 +449,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; - } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { + mm_flags |= FAULT_FLAG_INSTRUCTION; + } else if (is_write_abort(esr)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } @@ -492,12 +489,14 @@ retry: */ might_sleep(); #ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && !search_exception_tables(regs->pc)) + if (!user_mode(regs) && !search_exception_tables(regs->pc)) { + up_read(&mm->mmap_sem); goto no_context; + } #endif } - fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk); + fault = __do_page_fault(mm, addr, mm_flags, vm_flags); major |= fault & VM_FAULT_MAJOR; if (fault & VM_FAULT_RETRY) { @@ -537,11 +536,11 @@ retry: * that point. */ if (major) { - tsk->maj_flt++; + current->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr); } else { - tsk->min_flt++; + current->min_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr); } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f475e54fbc43..bbeb6a5a6ba6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -228,7 +228,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (sz == PUD_SIZE) { ptep = (pte_t *)pudp; - } else if (sz == (PAGE_SIZE * CONT_PTES)) { + } else if (sz == (CONT_PTE_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); @@ -246,7 +246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, ptep = huge_pmd_share(mm, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); - } else if (sz == (PMD_SIZE * CONT_PMDS)) { + } else if (sz == (CONT_PMD_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); return (pte_t *)pmdp; @@ -454,9 +454,9 @@ static int __init hugetlbpage_init(void) #ifdef CONFIG_ARM64_4K_PAGES add_huge_page_size(PUD_SIZE); #endif - add_huge_page_size(PMD_SIZE * CONT_PMDS); + add_huge_page_size(CONT_PMD_SIZE); add_huge_page_size(PMD_SIZE); - add_huge_page_size(PAGE_SIZE * CONT_PTES); + add_huge_page_size(CONT_PTE_SIZE); return 0; } @@ -470,9 +470,9 @@ static __init int setup_hugepagesz(char *opt) #ifdef CONFIG_ARM64_4K_PAGES case PUD_SIZE: #endif - case PMD_SIZE * CONT_PMDS: + case CONT_PMD_SIZE: case PMD_SIZE: - case PAGE_SIZE * CONT_PTES: + case CONT_PTE_SIZE: add_huge_page_size(ps); return 1; } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 749c9b269f08..f3c795278def 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -180,8 +180,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys()); +#ifdef CONFIG_ZONE_DMA32 + 
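/*
 * Illustration, not part of the patch: the CONT_* names in the hugetlb
 * hunks above are pure arithmetic, CONT_PTE_SIZE == CONT_PTES * PAGE_SIZE
 * and CONT_PMD_SIZE == CONT_PMDS * PMD_SIZE. A standalone check, assuming
 * the usual 4K-granule values of 16 contiguous entries per level:
 */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096UL;		/* 4K granule */
	unsigned long pmd_size  = page_size << 9;	/* 512 PTEs per PMD = 2M */
	unsigned long cont_ptes = 16, cont_pmds = 16;

	printf("CONT_PTE_SIZE = %lu KiB\n", cont_ptes * page_size >> 10);	/* 64 */
	printf("CONT_PMD_SIZE = %lu MiB\n", cont_pmds * pmd_size >> 20);	/* 32 */
	return 0;
}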
max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys()); +#endif max_zone_pfns[ZONE_NORMAL] = max; free_area_init_nodes(max_zone_pfns); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e5ae8663f230..1b49c08dfa2b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -362,7 +362,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, static phys_addr_t __pgd_pgtable_alloc(int shift) { - void *ptr = (void *)__get_free_page(PGALLOC_GFP); + void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ @@ -765,7 +765,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } -#endif /* CONFIG_ARM64_64K_PAGES */ +#endif /* !ARM64_SWAPPER_USES_SECTION_MAPS */ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { @@ -960,32 +960,28 @@ int __init arch_ioremap_pmd_supported(void) int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { - pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT | - pgprot_val(mk_sect_prot(prot))); - pud_t new_pud = pfn_pud(__phys_to_pfn(phys), sect_prot); + pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); /* Only allow permission changes for now */ if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)), pud_val(new_pud))) return 0; - BUG_ON(phys & ~PUD_MASK); + VM_BUG_ON(phys & ~PUD_MASK); set_pud(pudp, new_pud); return 1; } int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) { - pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT | - pgprot_val(mk_sect_prot(prot))); - pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), sect_prot); + pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); /* Only allow permission changes for now */ if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)), pmd_val(new_pmd))) return 0; - BUG_ON(phys & ~PMD_MASK); + VM_BUG_ON(phys & ~PMD_MASK); set_pmd(pmdp, new_pmd); return 1; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 47b057bfa803..03c53f16ee77 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -19,8 +19,7 @@ struct page_change_data { bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); -static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, - void *data) +static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; pte_t pte = READ_ONCE(*ptep); @@ -151,17 +150,48 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) __pgprot(PTE_VALID)); } -#ifdef CONFIG_DEBUG_PAGEALLOC +int set_direct_map_invalid_noflush(struct page *page) +{ + struct page_change_data data = { + .set_mask = __pgprot(0), + .clear_mask = __pgprot(PTE_VALID), + }; + + if (!rodata_full) + return 0; + + return apply_to_page_range(&init_mm, + (unsigned long)page_address(page), + PAGE_SIZE, change_page_range, &data); +} + +int set_direct_map_default_noflush(struct page *page) +{ + struct page_change_data data = { + .set_mask = __pgprot(PTE_VALID | PTE_WRITE), + .clear_mask = __pgprot(PTE_RDONLY), + }; + + if (!rodata_full) + return 0; + + return apply_to_page_range(&init_mm, + (unsigned long)page_address(page), + PAGE_SIZE, change_page_range, &data); +} + void __kernel_map_pages(struct page *page, int numpages, int enable) { + if (!debug_pagealloc_enabled() && !rodata_full) + return; + set_memory_valid((unsigned long)page_address(page), numpages, enable); } -#ifdef CONFIG_HIBERNATION + /* - * When built 
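/*
 * Illustration, not part of the patch: set_direct_map_*_noflush() above
 * rely on apply_to_page_range(), whose callback now takes (ptep, addr,
 * data) with the old pgtable_t token dropped. A minimal sketch of the
 * same pattern clearing the valid bit on one page:
 */
static int demo_clear_valid(pte_t *ptep, unsigned long addr, void *data)
{
	pte_t pte = READ_ONCE(*ptep);

	set_pte(ptep, __pte(pte_val(pte) & ~PTE_VALID));
	return 0;
}

static int demo_invalidate_page(struct page *page)
{
	return apply_to_page_range(&init_mm,
				   (unsigned long)page_address(page),
				   PAGE_SIZE, demo_clear_valid, NULL);
}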
with CONFIG_DEBUG_PAGEALLOC and CONFIG_HIBERNATION, this function - * is used to determine if a linear map page has been marked as not-valid by - * CONFIG_DEBUG_PAGEALLOC. Walk the page table and check the PTE_VALID bit. - * This is based on kern_addr_valid(), which almost does what we need. + * This function is used to determine if a linear map page has been marked as + * not-valid. Walk the page table and check the PTE_VALID bit. This is based + * on kern_addr_valid(), which almost does what we need. * * Because this is only called on the kernel linear map, p?d_sect() implies * p?d_present(). When debug_pagealloc is enabled, sections mappings are @@ -175,6 +205,9 @@ bool kernel_page_present(struct page *page) pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); + if (!debug_pagealloc_enabled() && !rodata_full) + return true; + pgdp = pgd_offset_k(addr); if (pgd_none(READ_ONCE(*pgdp))) return false; @@ -196,5 +229,3 @@ bool kernel_page_present(struct page *page) ptep = pte_offset_kernel(pmdp, addr); return pte_valid(READ_ONCE(*ptep)); } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 9a0c7d5090d6..7548f9ca1f11 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -19,10 +19,12 @@ static struct kmem_cache *pgd_cache __ro_after_init; pgd_t *pgd_alloc(struct mm_struct *mm) { + gfp_t gfp = GFP_PGTABLE_USER; + if (PGD_SIZE == PAGE_SIZE) - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_page(gfp); else - return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); + return kmem_cache_alloc(pgd_cache, gfp); } void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 87c568807925..f5b437f8a22b 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -970,7 +970,7 @@ void *bpf_jit_alloc_exec(unsigned long size) { return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index eeb0471268a0..b4fb61c83494 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 # # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. 
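/*
 * Illustration, not part of the patch: the GFP_PGTABLE_USER and
 * GFP_PGTABLE_KERNEL flags used in the pgalloc conversions above replace
 * the per-arch PGALLOC_GFP. To the best of our knowledge this series
 * defines them generically as:
 *
 *	#define GFP_PGTABLE_KERNEL	(GFP_KERNEL | __GFP_ZERO)
 *	#define GFP_PGTABLE_USER	(GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
 *
 * i.e. both are zeroed, and user page tables are additionally charged to
 * the allocating memcg, which several of the old per-arch variants missed.
 */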
# config C6X def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select CLKDEV_LOOKUP diff --git a/arch/c6x/include/asm/flat.h b/arch/c6x/include/asm/flat.h index 76fd0bb962a3..9e6544b51386 100644 --- a/arch/c6x/include/asm/flat.h +++ b/arch/c6x/include/asm/flat.h @@ -4,11 +4,8 @@ #include <asm/unaligned.h> -#define flat_argvp_envp_on_stack() 0 -#define flat_old_ram_flag(flags) (flags) -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) + u32 *addr) { *addr = get_unaligned((__force u32 *)rp); return 0; @@ -18,7 +15,5 @@ static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel) put_unaligned(addr, (__force u32 *)rp); return 0; } -#define flat_get_relocate_addr(rel) (rel) -#define flat_set_persistent(relval, p) 0 #endif /* __ASM_C6X_FLAT_H */ diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index e72d9b6bc234..e456652facce 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -90,7 +90,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) return regs->a4; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index c4785c9b67a2..ec61034fdf56 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -250,7 +250,7 @@ static void do_trap(struct exception_info *except_info, struct pt_regs *regs) die_if_kernel(except_info->kernel_str, regs, addr); force_sig_fault(except_info->signo, except_info->code, - (void __user *)addr, current); + (void __user *)addr); } /* diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index d789be36eb4f..27ef5b2c43ab 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -283,7 +283,7 @@ bad_area: do_exit(SIGKILL); } - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); } static struct ctl_table alignment_tbl[4] = { diff --git a/arch/csky/abiv2/fpu.c b/arch/csky/abiv2/fpu.c index e7e11344005a..86d187d4e5af 100644 --- a/arch/csky/abiv2/fpu.c +++ b/arch/csky/abiv2/fpu.c @@ -124,7 +124,7 @@ void fpu_fpe(struct pt_regs *regs) code = FPE_FLTRES; } - force_sig_fault(sig, code, (void __user *)regs->pc, current); + force_sig_fault(sig, code, (void __user *)regs->pc); } #define FMFVR_FPU_REGS(vrx, vry) \ diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index d213bb47b717..98c5716708d6 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -8,6 +8,9 @@ #include <linux/mm.h> #include <linux/sched.h> +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -39,33 +42,6 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) return pte; } -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); - if (!pte) - return NULL; - - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, PTE_ORDER); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - 
__free_pages(pte, PTE_ORDER); -} - static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ORDER); diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index d47a3381aad8..9b1b7c039ddf 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -66,7 +66,6 @@ SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; - struct task_struct *task; sigset_t set; /* Always make any pending restarted system calls return -EINTR */ @@ -91,8 +90,7 @@ SYSCALL_DEFINE0(rt_sigreturn) return regs->a0; badframe: - task = current; - force_sig(SIGSEGV, task); + force_sig(SIGSEGV); return 0; } diff --git a/arch/csky/kernel/traps.c b/arch/csky/kernel/traps.c index f487a9b996ae..2792e9601ac5 100644 --- a/arch/csky/kernel/traps.c +++ b/arch/csky/kernel/traps.c @@ -106,7 +106,7 @@ void buserr(struct pt_regs *regs) pr_err("User mode Bus Error\n"); show_regs(regs); - force_sig_fault(SIGSEGV, 0, (void __user *)regs->pc, current); + force_sig_fault(SIGSEGV, 0, (void __user *)regs->pc); } #define USR_BKPT 0x1464 diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index 18041f46ded1..f76618b630f9 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -179,7 +179,7 @@ bad_area: bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { - force_sig_fault(SIGSEGV, si_code, (void __user *)address, current); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } @@ -212,5 +212,5 @@ do_sigbus: if (!user_mode(regs)) goto no_context; - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index ecfc4b4b6373..ec800e9d5aad 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -2,6 +2,9 @@ config H8300 def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_BINFMT_FLAT + select BINFMT_FLAT_ARGVP_ENVP_ON_STACK + select BINFMT_FLAT_OLD_ALWAYS_RAM select GENERIC_ATOMIC64 select HAVE_UID16 select VIRT_TO_BUS diff --git a/arch/h8300/include/asm/flat.h b/arch/h8300/include/asm/flat.h index f4cdfcbdd2ba..78070f924177 100644 --- a/arch/h8300/include/asm/flat.h +++ b/arch/h8300/include/asm/flat.h @@ -8,11 +8,6 @@ #include <asm/unaligned.h> -#define flat_argvp_envp_on_stack() 1 -#define flat_old_ram_flag(flags) 1 -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) -#define flat_set_persistent(relval, p) 0 - /* * on the H8 a couple of the relocations have an instruction in the * top byte. 
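/*
 * Illustration, not part of the patch: the signal conversions running
 * through this series drop the explicit task argument, so the forcing
 * helpers always act on current. On most architectures the new
 * signatures are simply:
 *
 *	void force_sig(int sig);
 *	void force_sig_fault(int sig, int code, void __user *addr);
 *
 * so a handler that used to write force_sig(SIGSEGV, current) becomes:
 */
static void demo_deliver_segv(void)
{
	force_sig(SIGSEGV);	/* delivered to current by definition */
}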
As there can only be 24bits of address space, we just @@ -22,7 +17,7 @@ #define flat_get_relocate_addr(rel) (rel & ~0x00000001) static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) + u32 *addr) { u32 val = get_unaligned((__force u32 *)rp); if (!(flags & FLAT_FLAG_GOTPIC)) diff --git a/arch/h8300/kernel/ptrace_h.c b/arch/h8300/kernel/ptrace_h.c index f5ff3b794c85..15db45a03b04 100644 --- a/arch/h8300/kernel/ptrace_h.c +++ b/arch/h8300/kernel/ptrace_h.c @@ -250,7 +250,7 @@ asmlinkage void trace_trap(unsigned long bp) { if ((unsigned long)current->thread.breakinfo.addr == bp) { user_disable_single_step(current); - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } else - force_sig(SIGILL, current); + force_sig(SIGILL); } diff --git a/arch/h8300/kernel/ptrace_s.c b/arch/h8300/kernel/ptrace_s.c index c0af930052c0..ee21f37b7ed4 100644 --- a/arch/h8300/kernel/ptrace_s.c +++ b/arch/h8300/kernel/ptrace_s.c @@ -40,5 +40,5 @@ void user_enable_single_step(struct task_struct *child) asmlinkage void trace_trap(unsigned long bp) { (void)bp; - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index e0f2b708e5d9..ef7489b7c459 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -126,7 +126,7 @@ asmlinkage int sys_rt_sigreturn(void) return er0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index 5bc36db26475..d48864c48e5a 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -252,6 +252,6 @@ asmlinkage int sys_rt_sigreturn(void) return regs->r00; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index a01da26dbfe1..69c623b14ddd 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -239,7 +239,7 @@ int die_if_kernel(char *str, struct pt_regs *regs, long err) static void misaligned_instruction(struct pt_regs *regs) { die_if_kernel("Misaligned Instruction", regs, 0); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } /* @@ -250,19 +250,19 @@ static void misaligned_instruction(struct pt_regs *regs) static void misaligned_data_load(struct pt_regs *regs) { die_if_kernel("Misaligned Data Load", regs, 0); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } static void misaligned_data_store(struct pt_regs *regs) { die_if_kernel("Misaligned Data Store", regs, 0); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } static void illegal_instruction(struct pt_regs *regs) { die_if_kernel("Illegal Instruction", regs, 0); - force_sig(SIGILL, current); + force_sig(SIGILL); } /* @@ -272,7 +272,7 @@ static void illegal_instruction(struct pt_regs *regs) static void precise_bus_error(struct pt_regs *regs) { die_if_kernel("Precise Bus Error", regs, 0); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } /* @@ -407,7 +407,7 @@ void do_trap0(struct pt_regs *regs) * may want to use a different trap0 flavor. 
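/*
 * Illustration, not part of the patch: the H8 quirk described above means
 * a relocated word carries an opcode in its top byte and a 24-bit address
 * below it. A minimal sketch of patching such a word (the 8/24 split is
 * the assumption here):
 */
static u32 demo_h8_patch_addr(u32 word, u32 new_addr)
{
	return (word & 0xff000000u) |		/* preserve the opcode byte */
	       (new_addr & 0x00ffffffu);	/* splice in the 24-bit address */
}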
*/ force_sig_fault(SIGTRAP, TRAP_BRKPT, - (void __user *) pt_elr(regs), current); + (void __user *) pt_elr(regs)); } else { #ifdef CONFIG_KGDB kgdb_handle_exception(pt_cause(regs), SIGTRAP, diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index b7a99aa5b0ba..b3bc71680ae4 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -135,14 +135,14 @@ good_area: si_signo = SIGSEGV; si_code = SEGV_ACCERR; } - force_sig_fault(si_signo, si_code, (void __user *)address, current); + force_sig_fault(si_signo, si_code, (void __user *)address); return; bad_area: up_read(&mm->mmap_sem); if (user_mode(regs)) { - force_sig_fault(SIGSEGV, si_code, (void __user *)address, current); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } /* Kernel-mode fault falls through */ diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index 7aeb48a18576..1a338e541334 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -324,8 +324,6 @@ static int rs_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) return -ENOIOCTLCMD; } -#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) - /* * This routine will shutdown a serial port; interrupts are disabled, and * DTR is dropped if the hangup on close termio flag is on. diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 206530d0751b..50440f3ddc43 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -124,10 +124,10 @@ ATOMIC_FETCH_OP(xor, ^) #undef ATOMIC_OP #define ATOMIC64_OP(op, c_op) \ -static __inline__ long \ -ia64_atomic64_##op (__s64 i, atomic64_t *v) \ +static __inline__ s64 \ +ia64_atomic64_##op (s64 i, atomic64_t *v) \ { \ - __s64 old, new; \ + s64 old, new; \ CMPXCHG_BUGCHECK_DECL \ \ do { \ @@ -139,10 +139,10 @@ ia64_atomic64_##op (__s64 i, atomic64_t *v) \ } #define ATOMIC64_FETCH_OP(op, c_op) \ -static __inline__ long \ -ia64_atomic64_fetch_##op (__s64 i, atomic64_t *v) \ +static __inline__ s64 \ +ia64_atomic64_fetch_##op (s64 i, atomic64_t *v) \ { \ - __s64 old, new; \ + s64 old, new; \ CMPXCHG_BUGCHECK_DECL \ \ do { \ @@ -162,7 +162,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_add_return(i,v) \ ({ \ - long __ia64_aar_i = (i); \ + s64 __ia64_aar_i = (i); \ __ia64_atomic_const(i) \ ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \ : ia64_atomic64_add(__ia64_aar_i, v); \ @@ -170,7 +170,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_sub_return(i,v) \ ({ \ - long __ia64_asr_i = (i); \ + s64 __ia64_asr_i = (i); \ __ia64_atomic_const(i) \ ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \ : ia64_atomic64_sub(__ia64_asr_i, v); \ @@ -178,7 +178,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_fetch_add(i,v) \ ({ \ - long __ia64_aar_i = (i); \ + s64 __ia64_aar_i = (i); \ __ia64_atomic_const(i) \ ? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq) \ : ia64_atomic64_fetch_add(__ia64_aar_i, v); \ @@ -186,7 +186,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_fetch_sub(i,v) \ ({ \ - long __ia64_asr_i = (i); \ + s64 __ia64_asr_i = (i); \ __ia64_atomic_const(i) \ ? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq) \ : ia64_atomic64_fetch_sub(__ia64_asr_i, v); \ diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c index c0239bf77a09..782c481d7052 100644 --- a/arch/ia64/kernel/brl_emu.c +++ b/arch/ia64/kernel/brl_emu.c @@ -197,21 +197,21 @@ ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec) */ printk(KERN_DEBUG "Woah! 
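/*
 * Illustration, not part of the patch: the long -> s64 conversions in the
 * ia64 atomic hunks above make atomic64_t behave identically on ILP32 and
 * LP64 builds. A minimal sketch of the same cmpxchg-loop shape with the
 * fixed-width type:
 */
static s64 demo_atomic64_add(s64 i, atomic64_t *v)
{
	s64 old, new;

	do {
		old = atomic64_read(v);
		new = old + i;
	} while (atomic64_cmpxchg(v, old, new) != old);

	return new;
}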
Unimplemented Instruction Address Trap!\n"); force_sig_fault(SIGILL, ILL_BADIADDR, (void __user *)NULL, - 0, 0, 0, current); + 0, 0, 0); } else if (ia64_psr(regs)->tb) { /* * Branch Tracing is enabled. * Force a taken branch signal. */ force_sig_fault(SIGTRAP, TRAP_BRANCH, (void __user *)NULL, - 0, 0, 0, current); + 0, 0, 0); } else if (ia64_psr(regs)->ss) { /* * Single Step is enabled. * Force a trace signal. */ force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)NULL, - 0, 0, 0, current); + 0, 0, 0); } return rv; } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6a52d761854b..79190d877fa7 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, ti->cpu = cpu; p->stack = ti; p->state = TASK_UNINTERRUPTIBLE; - cpumask_set_cpu(cpu, &p->cpus_allowed); + cpumask_set_cpu(cpu, &p->cpus_mask); INIT_LIST_HEAD(&p->tasks); p->parent = p->real_parent = p->group_leader = p; INIT_LIST_HEAD(&p->children); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 58a6337c0690..7c52bd2695a2 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6390,11 +6390,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) } /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1); - if (ret) { - DPRINT(("on_each_cpu() failed: %d\n", ret)); - goto cleanup_reserve; - } + on_each_cpu(pfm_alt_save_pmu_state, NULL, 1); /* officially change to the alternate interrupt handler */ pfm_alt_intr_handler = hdl; @@ -6421,7 +6417,6 @@ int pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) { int i; - int ret; if (hdl == NULL) return -EINVAL; @@ -6435,10 +6430,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) pfm_alt_intr_handler = NULL; - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1); - if (ret) { - DPRINT(("on_each_cpu() failed: %d\n", ret)); - } + on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1); for_each_online_cpu(i) { pfm_unreserve_session(NULL, 1, i); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 6062fd14e34e..e5044aed9452 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -152,7 +152,7 @@ ia64_rt_sigreturn (struct sigscratch *scr) return retval; give_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return retval; } @@ -257,7 +257,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr) */ check_sp = (new_sp - sizeof(*frame)) & -STACK_ALIGN; if (!likely(on_sig_stack(check_sp))) { - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return 1; } } @@ -265,7 +265,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr) frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN); if (!access_ok(frame, sizeof(*frame))) { - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return 1; } @@ -282,7 +282,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr) err |= setup_sigcontext(&frame->sc, set, scr); if (unlikely(err)) { - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return 1; } diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index e01df3f2f80d..ecc44926737b 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -354,3 +354,4 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open 
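/*
 * Illustration, not part of the patch: userspace usage of the pidfd_open
 * wiring added in the syscall table above. 434 is the generic syscall
 * number from that table; the flags argument must currently be 0.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif

int main(void)
{
	long fd = syscall(__NR_pidfd_open, getpid(), 0);

	if (fd < 0) {
		perror("pidfd_open");
		return 1;
	}
	printf("pidfd %ld refers to pid %d\n", fd, (int)getpid());
	return 0;
}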
sys_pidfd_open diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 85d8616ac4f6..e13cb905930f 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -176,7 +176,7 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) } force_sig_fault(sig, code, (void __user *) (regs->cr_iip + ia64_psr(regs)->ri), - break_num, 0 /* clear __ISR_VALID */, 0, current); + break_num, 0 /* clear __ISR_VALID */, 0); } /* @@ -353,7 +353,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) } force_sig_fault(SIGFPE, si_code, (void __user *) (regs->cr_iip + ia64_psr(regs)->ri), - 0, __ISR_VALID, isr, current); + 0, __ISR_VALID, isr); } } else { if (exception == -1) { @@ -373,7 +373,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) } force_sig_fault(SIGFPE, si_code, (void __user *) (regs->cr_iip + ia64_psr(regs)->ri), - 0, __ISR_VALID, isr, current); + 0, __ISR_VALID, isr); } } return 0; @@ -408,7 +408,7 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3, force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *) (regs.cr_iip + ia64_psr(&regs)->ri), - 0, 0, 0, current); + 0, 0, 0); return rv; } @@ -483,7 +483,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, + ia64_psr(&regs)->ri); } force_sig_fault(sig, code, addr, - vector, __ISR_VALID, isr, current); + vector, __ISR_VALID, isr); return; } else if (ia64_done_with_exception(&regs)) return; @@ -493,7 +493,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, case 31: /* Unsupported Data Reference */ if (user_mode(&regs)) { force_sig_fault(SIGILL, ILL_ILLOPN, (void __user *) iip, - vector, __ISR_VALID, isr, current); + vector, __ISR_VALID, isr); return; } sprintf(buf, "Unsupported data reference"); @@ -542,7 +542,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, == NOTIFY_STOP) return; force_sig_fault(SIGTRAP, si_code, (void __user *) ifa, - 0, __ISR_VALID, isr, current); + 0, __ISR_VALID, isr); return; case 32: /* fp fault */ @@ -550,7 +550,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, result = handle_fpu_swa((vector == 32) ?
1 : 0, &regs, isr); if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) { force_sig_fault(SIGFPE, FPE_FLTINV, (void __user *) iip, - 0, __ISR_VALID, isr, current); + 0, __ISR_VALID, isr); } return; @@ -578,7 +578,7 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, if (user_mode(&regs)) { force_sig_fault(SIGILL, ILL_BADIADDR, (void __user *) iip, - 0, 0, 0, current); + 0, 0, 0); return; } sprintf(buf, "Unimplemented Instruction Address fault"); @@ -589,14 +589,14 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n"); printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n", iip, ifa, isr); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; case 46: printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n"); printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n", iip, ifa, isr, iim); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; case 47: @@ -608,5 +608,5 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, break; } if (!die_if_kernel(buf, &regs, error)) - force_sig(SIGILL, current); + force_sig(SIGILL); } diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index a167a3824b35..eb7d5df59fa3 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -1537,6 +1537,6 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) } force_sigbus: force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) ifa, - 0, 0, 0, current); + 0, 0, 0); goto done; } diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index edcdfc149311..16c6d377c502 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -121,8 +121,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) { atomic_set(&uc_pool->status, 0); - status = smp_call_function(uncached_ipi_visibility, uc_pool, 1); - if (status || atomic_read(&uc_pool->status)) + smp_call_function(uncached_ipi_visibility, uc_pool, 1); + if (atomic_read(&uc_pool->status)) goto failed; } else if (status != PAL_VISIBILITY_OK) goto failed; @@ -143,8 +143,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) if (status != PAL_STATUS_SUCCESS) goto failed; atomic_set(&uc_pool->status, 0); - status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1); - if (status || atomic_read(&uc_pool->status)) + smp_call_function(uncached_ipi_mc_drain, uc_pool, 1); + if (atomic_read(&uc_pool->status)) goto failed; /* diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 5baeb022f474..3c3a283d3172 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -249,7 +249,7 @@ retry: } if (user_mode(regs)) { force_sig_fault(signal, code, (void __user *) address, - 0, __ISR_VALID, isr, current); + 0, __ISR_VALID, isr); return; } diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 218e037ef901..c518d695c376 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,10 +3,15 @@ config M68K bool default y select ARCH_32BIT_OFF_T + select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_DMA_MMAP_PGPROT if MMU && !COLDFIRE + select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA select ARCH_MIGHT_HAVE_PC_PARPORT if ISA select ARCH_NO_COHERENT_DMA_MMAP if !MMU select ARCH_NO_PREEMPT if !COLDFIRE + select
BINFMT_FLAT_ARGVP_ENVP_ON_STACK + select DMA_DIRECT_REMAP if HAS_DMA && MMU && !COLDFIRE select HAVE_IDE select HAVE_AOUT if MMU select HAVE_DEBUG_BUGVERBOSE diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index fea392cfcf1b..04e0f211afb3 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -71,9 +71,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -205,7 +202,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -231,7 +227,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -308,7 +303,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -436,6 +430,8 @@ CONFIG_FB_AMIGA_OCS=y CONFIG_FB_AMIGA_ECS=y CONFIG_FB_AMIGA_AGA=y CONFIG_FB_FM2=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m @@ -553,13 +549,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -583,7 +580,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -626,6 +622,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 2474d267460e..c6abbb535878 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -67,9 +67,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -201,7 +198,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -227,7 +223,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -304,7 +299,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -397,6 +391,8 @@ CONFIG_PPS_CLIENT_LDISC=m 
CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set CONFIG_FB=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_VGA16 is not set @@ -513,13 +509,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -543,7 +540,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -586,6 +582,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 0fc7d2992fe0..06ae65bad177 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -74,9 +74,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -208,7 +205,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -234,7 +230,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -311,7 +306,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -421,6 +415,8 @@ CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set CONFIG_FB=y CONFIG_FB_ATARI=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m @@ -535,13 +531,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -565,7 +562,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -608,6 +604,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 699df9fdf866..5616b94053b6 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -64,9 +64,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m 
CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -198,7 +195,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -224,7 +220,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -301,7 +296,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -394,6 +388,8 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -506,13 +502,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -536,7 +533,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -579,6 +575,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index b50802255324..1106521f3b56 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -66,9 +66,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -200,7 +197,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -226,7 +222,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -303,7 +298,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -399,6 +393,8 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set CONFIG_FB=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set @@ -515,13 +511,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y 
CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -545,7 +542,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -588,6 +584,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 04e7d70f6030..226c6c063cd4 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -65,9 +65,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -199,7 +196,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -225,7 +221,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -305,7 +300,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -423,6 +417,8 @@ CONFIG_PTP_1588_CLOCK=m CONFIG_FB=y CONFIG_FB_VALKYRIE=y CONFIG_FB_MAC=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_HID=m @@ -537,13 +533,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -567,7 +564,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -610,6 +606,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 5e1cc4c17852..39f603417928 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -85,9 +85,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -219,7 +216,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -245,7 +241,6 @@ 
CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -325,7 +320,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -499,6 +493,8 @@ CONFIG_FB_FM2=y CONFIG_FB_ATARI=y CONFIG_FB_VALKYRIE=y CONFIG_FB_MAC=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m @@ -619,13 +615,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -649,7 +646,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -692,6 +688,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 170ac8792c2d..175a607f576c 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -63,9 +63,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -197,7 +194,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -223,7 +219,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -300,7 +295,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -393,6 +387,8 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -505,13 +501,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -535,7 +532,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -578,6 +574,7 @@ CONFIG_ATOMIC64_SELFTEST=m 
CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index d865592a423e..f41c34d3cdd0 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -64,9 +64,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -198,7 +195,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -224,7 +220,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -301,7 +296,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -394,6 +388,8 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -506,13 +502,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -536,7 +533,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -579,6 +575,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 034a9de90484..c9d2cb0a1cf4 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -65,9 +65,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -199,7 +196,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -225,7 +221,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -302,7 +297,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -408,6 +402,8 @@ CONFIG_PPS_CLIENT_PARPORT=m 
CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set CONFIG_FB=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m @@ -524,13 +520,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -554,7 +551,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -597,6 +593,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 49be0f9fcd8d..79a64fdd6bf0 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -61,9 +61,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -195,7 +192,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -221,7 +217,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -298,7 +293,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -394,6 +388,8 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set CONFIG_FB=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_HID=m @@ -508,13 +504,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -538,7 +535,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -581,6 +577,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a71acf4a6004..e3402a5d165b 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -61,9 +61,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m CONFIG_INET_ESP_OFFLOAD=m 
CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m CONFIG_INET_DIAG=m CONFIG_INET_UDP_DIAG=m CONFIG_INET_RAW_DIAG=m @@ -195,7 +192,6 @@ CONFIG_IP_SET_HASH_NETNET=m CONFIG_IP_SET_HASH_NETPORT=m CONFIG_IP_SET_HASH_NETIFACE=m CONFIG_IP_SET_LIST_SET=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NFT_DUP_IPV4=m CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y @@ -221,7 +217,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -298,7 +293,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -# CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m @@ -393,6 +387,8 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set CONFIG_FB=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_HID=m @@ -507,13 +503,14 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_RSA=m -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_RSA=m +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m CONFIG_CRYPTO_AEGIS128L=m @@ -537,7 +534,6 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -580,6 +576,7 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_STRING_HELPERS=m +CONFIG_TEST_STRSCPY=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m diff --git a/arch/m68k/include/asm/flat.h b/arch/m68k/include/asm/flat.h index 4f1d1e373420..46379e08cdd6 100644 --- a/arch/m68k/include/asm/flat.h +++ b/arch/m68k/include/asm/flat.h @@ -6,35 +6,7 @@ #ifndef __M68KNOMMU_FLAT_H__ #define __M68KNOMMU_FLAT_H__ -#include <linux/uaccess.h> - -#define flat_argvp_envp_on_stack() 1 -#define flat_old_ram_flag(flags) (flags) -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) -static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) -{ -#ifdef CONFIG_CPU_HAS_NO_UNALIGNED - return copy_from_user(addr, rp, 4) ? -EFAULT : 0; -#else - return get_user(*addr, rp); -#endif -} - -static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel) -{ -#ifdef CONFIG_CPU_HAS_NO_UNALIGNED - return copy_to_user(rp, &addr, 4) ? 
-EFAULT : 0; -#else - return put_user(addr, rp); -#endif -} -#define flat_get_relocate_addr(rel) (rel) - -static inline int flat_set_persistent(u32 relval, u32 *persistent) -{ - return 0; -} +#include <asm-generic/flat.h> #define FLAT_PLAT_INIT(regs) \ do { \ diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 1456c5eecbd9..1a8ddbd0d23c 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -13,55 +13,18 @@ #include <asm/tlb.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + extern const char bad_pmd_string[]; #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long) pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t page) -{ - pgtable_page_dtor(page); - __free_page(page); -} - #define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - unsigned long page = __get_free_page(GFP_KERNEL); - - if (!page) - return NULL; - - memset((void *)page, 0, PAGE_SIZE); - return (pte_t *) (page); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page = alloc_pages(GFP_KERNEL, 0); - - if (page == NULL) - return NULL; - - clear_highpage(page); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; - -} - static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { pmd_val(*pmd) = __pa((unsigned long)pte); diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index b4aa853051bd..30cd59caf037 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -18,57 +18,22 @@ #include <asm/pgalloc.h> #if defined(CONFIG_MMU) && !defined(CONFIG_COLDFIRE) - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, - gfp_t flag, unsigned long attrs) +void arch_dma_prep_coherent(struct page *page, size_t size) { - struct page *page, **map; - pgprot_t pgprot; - void *addr; - int i, order; - - pr_debug("dma_alloc_coherent: %d,%x\n", size, flag); - - size = PAGE_ALIGN(size); - order = get_order(size); - - page = alloc_pages(flag | __GFP_ZERO, order); - if (!page) - return NULL; - - *handle = page_to_phys(page); - map = kmalloc(sizeof(struct page *) << order, flag & ~__GFP_DMA); - if (!map) { - __free_pages(page, order); - return NULL; - } - split_page(page, order); - - order = 1 << order; - size >>= PAGE_SHIFT; - map[0] = page; - for (i = 1; i < size; i++) - map[i] = page + i; - for (; i < order; i++) - __free_page(page + i); - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY); - if (CPU_IS_040_OR_060) - pgprot_val(pgprot) |= _PAGE_GLOBAL040 | _PAGE_NOCACHE_S; - else - pgprot_val(pgprot) |= _PAGE_NOCACHE030; - addr = vmap(map, size, VM_MAP, pgprot); - kfree(map); - - return addr; + cache_push(page_to_phys(page), size); } -void arch_dma_free(struct device *dev, size_t size, void *addr, - dma_addr_t handle, unsigned long attrs) +pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, + unsigned long attrs) { - pr_debug("dma_free_coherent: %p, %x\n", addr, handle); - vfree(addr); + if (CPU_IS_040_OR_060) { + pgprot_val(prot) &= ~_PAGE_CACHE040; + pgprot_val(prot) |= _PAGE_GLOBAL040 | _PAGE_NOCACHE_S; + } else { + pgprot_val(prot) |= _PAGE_NOCACHE030; + } + return prot; } - #else #include <asm/cacheflush.h> diff --git 
a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 87e7f3639839..05610e6924c1 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -803,7 +803,7 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) return regs->d0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -825,7 +825,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) return regs->d0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 7e3d0734b2f3..9a3eb2558568 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -433,3 +433,4 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index b2fd000b9285..344f93d36a9a 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -431,7 +431,7 @@ static inline void bus_error030 (struct frame *fp) pr_err("BAD KERNEL BUSERR\n"); die_if_kernel("Oops", &fp->ptregs,0); - force_sig(SIGKILL, current); + force_sig(SIGKILL); return; } } else { @@ -463,7 +463,7 @@ static inline void bus_error030 (struct frame *fp) !(ssw & RW) ? "write" : "read", addr, fp->ptregs.pc); die_if_kernel ("Oops", &fp->ptregs, buserr_type); - force_sig (SIGBUS, current); + force_sig (SIGBUS); return; } @@ -493,7 +493,7 @@ static inline void bus_error030 (struct frame *fp) do_page_fault (&fp->ptregs, addr, 0); } else { pr_debug("protection fault on insn access (segv).\n"); - force_sig (SIGSEGV, current); + force_sig (SIGSEGV); } } #else @@ -571,7 +571,7 @@ static inline void bus_error030 (struct frame *fp) !(ssw & RW) ? 
"write" : "read", addr, fp->ptregs.pc); die_if_kernel("Oops",&fp->ptregs,mmusr); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; } else { #if 0 @@ -598,7 +598,7 @@ static inline void bus_error030 (struct frame *fp) #endif pr_debug("Unknown SIGSEGV - 1\n"); die_if_kernel("Oops",&fp->ptregs,mmusr); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; } @@ -621,7 +621,7 @@ static inline void bus_error030 (struct frame *fp) buserr: pr_err("BAD KERNEL BUSERR\n"); die_if_kernel("Oops",&fp->ptregs,0); - force_sig(SIGKILL, current); + force_sig(SIGKILL); return; } @@ -660,7 +660,7 @@ static inline void bus_error030 (struct frame *fp) addr, fp->ptregs.pc); pr_debug("Unknown SIGSEGV - 2\n"); die_if_kernel("Oops",&fp->ptregs,mmusr); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; } @@ -804,7 +804,7 @@ asmlinkage void buserr_c(struct frame *fp) default: die_if_kernel("bad frame format",&fp->ptregs,0); pr_debug("Unknown SIGSEGV - 4\n"); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } } @@ -1127,7 +1127,7 @@ asmlinkage void trap_c(struct frame *fp) addr = (void __user*) fp->un.fmtb.daddr; break; } - force_sig_fault(sig, si_code, addr, current); + force_sig_fault(sig, si_code, addr); } void die_if_kernel (char *str, struct pt_regs *fp, int nr) @@ -1159,6 +1159,6 @@ asmlinkage void fpsp040_die(void) #ifdef CONFIG_M68KFPU_EMU asmlinkage void fpemu_signal(int signal, int code, void *addr) { - force_sig_fault(signal, code, addr, current); + force_sig_fault(signal, code, addr); } #endif diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 11be08f4f750..205ac75da13d 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -911,6 +911,10 @@ static const struct resource mac_scsi_iifx_rsrc[] __initconst = { .flags = IORESOURCE_MEM, .start = 0x50008000, .end = 0x50009FFF, + }, { + .flags = IORESOURCE_MEM, + .start = 0x50008000, + .end = 0x50009FFF, }, }; @@ -1012,10 +1016,12 @@ int __init mac_platform_init(void) case MAC_SCSI_IIFX: /* Addresses from The Guide to Mac Family Hardware. * $5000 8000 - $5000 9FFF: SCSI DMA + * $5000 A000 - $5000 BFFF: Alternate SCSI * $5000 C000 - $5000 DFFF: Alternate SCSI (DMA) * $5000 E000 - $5000 FFFF: Alternate SCSI (Hsk) - * The SCSI DMA custom IC embeds the 53C80 core. mac_scsi does - * not make use of its DMA or hardware handshaking logic. + * The A/UX header file sys/uconfig.h says $50F0 8000. + * The "SCSI DMA" custom IC embeds the 53C80 core and + * supports Programmed IO, DMA and PDMA (hardware handshake). */ platform_device_register_simple("mac_scsi", 0, mac_scsi_iifx_rsrc, ARRAY_SIZE(mac_scsi_iifx_rsrc)); diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 9b6163c05a75..e9b1d7585b43 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -30,13 +30,13 @@ int send_fault_sig(struct pt_regs *regs) pr_debug("send_fault_sig: %p,%d,%d\n", addr, signo, si_code); if (user_mode(regs)) { - force_sig_fault(signo, si_code, addr, current); + force_sig_fault(signo, si_code, addr); } else { if (fixup_exception(regs)) return -1; //if (signo == SIGBUS) - // force_sig_fault(si_signo, si_code, addr, current); + // force_sig_fault(si_signo, si_code, addr); /* * Oops. The kernel tried to access some bad page. 
We'll have to diff --git a/arch/m68k/q40/README b/arch/m68k/q40/README index 93f4c4cd3c45..a4991d2d8af6 100644 --- a/arch/m68k/q40/README +++ b/arch/m68k/q40/README @@ -31,7 +31,7 @@ drivers used by the Q40, apart from the very obvious (console etc.): char/joystick/* # most of this should work, not # in default config.in block/q40ide.c # startup for ide - ide* # see Documentation/ide/ide.txt + ide* # see Documentation/ide/ide.rst floppy.c # normal PC driver, DMA emu in asm/floppy.h # and arch/m68k/kernel/entry.S # see drivers/block/README.fd diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index f11433daab4a..d411de05b628 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -3,6 +3,7 @@ config MICROBLAZE def_bool y select ARCH_32BIT_OFF_T select ARCH_NO_SWAP + select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/microblaze/Kconfig.debug b/arch/microblaze/Kconfig.debug index 3a343188d86c..865527ac332a 100644 --- a/arch/microblaze/Kconfig.debug +++ b/arch/microblaze/Kconfig.debug @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/microblaze/Kconfig.platform b/arch/microblaze/Kconfig.platform index 5bf54c1d4f60..7795f90dad86 100644 --- a/arch/microblaze/Kconfig.platform +++ b/arch/microblaze/Kconfig.platform @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. # # Platform selection Kconfig menu for MicroBlaze targets # diff --git a/arch/microblaze/include/asm/flat.h b/arch/microblaze/include/asm/flat.h index 3d2747d4c967..1ab86770eaee 100644 --- a/arch/microblaze/include/asm/flat.h +++ b/arch/microblaze/include/asm/flat.h @@ -13,11 +13,6 @@ #include <asm/unaligned.h> -#define flat_argvp_envp_on_stack() 0 -#define flat_old_ram_flag(flags) (flags) -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) -#define flat_set_persistent(relval, p) 0 - /* * Microblaze works a little differently from other arches, because * of the MICROBLAZE_64 reloc type. 
Here, a 32 bit address is split @@ -33,7 +28,7 @@ */ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) + u32 *addr) { u32 *p = (__force u32 *)rp; diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c index eafff21fcb0e..cf99c411503e 100644 --- a/arch/microblaze/kernel/exceptions.c +++ b/arch/microblaze/kernel/exceptions.c @@ -63,7 +63,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) if (kernel_mode(regs)) die("Exception in kernel mode", regs, signr); - force_sig_fault(signr, code, (void __user *)addr, current); + force_sig_fault(signr, code, (void __user *)addr); } asmlinkage void full_exception(struct pt_regs *regs, unsigned int type, diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 0685696349bb..cdd4feb279c5 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -108,7 +108,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return rval; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 26339e417695..09b0cd7dab0a 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -439,3 +439,5 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 202ad6a494f5..e6a810b0c7ad 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -289,7 +289,7 @@ out_of_memory: do_sigbus: up_read(&mm->mmap_sem); if (user_mode(regs)) { - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); return; } bad_page_fault(regs, address, SIGBUS); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 70d3200476bf..d50fafd7bf3a 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -34,6 +34,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GUP_GET_PTE_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HANDLE_DOMAIN_IRQ select HAVE_ARCH_COMPILER_H select HAVE_ARCH_JUMP_LABEL @@ -52,6 +53,7 @@ config MIPS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EXIT_THREAD + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER @@ -1119,6 +1121,7 @@ config DMA_NONCOHERENT bool select ARCH_HAS_DMA_MMAP_PGPROT select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_UNCACHED_SEGMENT select NEED_DMA_MAP_STATE select ARCH_HAS_DMA_COHERENT_TO_PFN select DMA_NONCOHERENT_CACHE_SYNC diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 8f4486c4415b..eceff9b75b22 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -17,6 +17,7 @@ archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/mips/boot/tools relocs KBUILD_DEFCONFIG := 32r2el_defconfig +KBUILD_DTBS := dtbs # # Select the object file format to substitute into the linker script. 
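Every signal-delivery hunk in this section follows the same mechanical conversion: force_sig(), force_sig_fault() and related helpers now always act on current, so the explicit task argument is dropped, and the rare caller that genuinely targets another task (see force_fcr31_sig() further down) switches to force_sig_fault_to_task(). A minimal sketch of the new calling convention, using a hypothetical fault handler rather than code from any file touched here:

#include <linux/sched/signal.h>

/* example_send_segv() is hypothetical, for illustration only. */
static void example_send_segv(void __user *addr)
{
	/* Before this series: force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, current); */
	force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
}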
@@ -384,7 +385,7 @@ quiet_cmd_64 = OBJCOPY $@ vmlinux.64: vmlinux $(call cmd,64) -all: $(all-y) +all: $(all-y) $(KBUILD_DTBS) # boot $(boot-y): $(vmlinux-32) FORCE diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 3c453a1f1ff1..172801ed35b8 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -78,6 +78,8 @@ OBJCOPYFLAGS_piggy.o := --add-section=.image=$(obj)/vmlinux.bin.z \ $(obj)/piggy.o: $(obj)/dummy.o $(obj)/vmlinux.bin.z FORCE $(call if_changed,objcopy) +HOSTCFLAGS_calc_vmlinuz_load_addr.o += $(LINUXINCLUDE) # Calculate the load address of the compressed kernel image hostprogs-y := calc_vmlinuz_load_addr diff --git a/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c b/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c index 240f1d12df75..080b926d2623 100644 --- a/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c +++ b/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c @@ -9,7 +9,7 @@ #include <stdint.h> #include <stdio.h> #include <stdlib.h> -#include "../../../../include/linux/sizes.h" +#include <linux/sizes.h> int main(int argc, char *argv[]) { diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi b/arch/mips/boot/dts/mscc/ocelot.dtsi index 90c60d42f571..33ae74aaa1bb 100644 --- a/arch/mips/boot/dts/mscc/ocelot.dtsi +++ b/arch/mips/boot/dts/mscc/ocelot.dtsi @@ -132,11 +132,12 @@ <0x1270000 0x100>, <0x1280000 0x100>, <0x1800000 0x80000>, - <0x1880000 0x10000>; + <0x1880000 0x10000>, + <0x1060000 0x10000>; reg-names = "sys", "rew", "qs", "port0", "port1", "port2", "port3", "port4", "port5", "port6", "port7", "port8", "port9", "port10", "qsys", - "ana"; + "ana", "s2"; interrupts = <21 22>; interrupt-names = "xtr", "inj"; diff --git a/arch/mips/boot/dts/qca/ar9331.dtsi b/arch/mips/boot/dts/qca/ar9331.dtsi index 2bae201aa365..63a9f33aa43e 100644 --- a/arch/mips/boot/dts/qca/ar9331.dtsi +++ b/arch/mips/boot/dts/qca/ar9331.dtsi @@ -116,6 +116,32 @@ }; }; + eth0: ethernet@19000000 { + compatible = "qca,ar9330-eth"; + reg = <0x19000000 0x200>; + interrupts = <4>; + + resets = <&rst 9>, <&rst 22>; + reset-names = "mac", "mdio"; + clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_AHB>; + clock-names = "eth", "mdio"; + + status = "disabled"; + }; + + eth1: ethernet@1a000000 { + compatible = "qca,ar9330-eth"; + reg = <0x1a000000 0x200>; + interrupts = <5>; + + resets = <&rst 13>, <&rst 23>; + reset-names = "mac", "mdio"; + clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_AHB>; + clock-names = "eth", "mdio"; + + status = "disabled"; + }; + usb: usb@1b000100 { compatible = "chipidea,usb2"; reg = <0x1b000000 0x200>; diff --git a/arch/mips/boot/dts/qca/ar9331_dpt_module.dts b/arch/mips/boot/dts/qca/ar9331_dpt_module.dts index e7af2cf5f4c1..77bab823eb3b 100644 --- a/arch/mips/boot/dts/qca/ar9331_dpt_module.dts +++ b/arch/mips/boot/dts/qca/ar9331_dpt_module.dts @@ -76,3 +76,11 @@ reg = <0>; }; }; + +&eth0 { + status = "okay"; +}; + +&eth1 { + status = "okay"; +}; diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig index 0ee5e677662e..0de92ac1ca64 100644 --- a/arch/mips/configs/malta_defconfig +++ b/arch/mips/configs/malta_defconfig @@ -210,7 +210,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig index 041bffac043b..efc3abace048 100644 --- a/arch/mips/configs/malta_kvm_defconfig +++ 
b/arch/mips/configs/malta_kvm_defconfig @@ -215,7 +215,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/malta_kvm_guest_defconfig b/arch/mips/configs/malta_kvm_guest_defconfig index 511065e62182..c6ceeca4394d 100644 --- a/arch/mips/configs/malta_kvm_guest_defconfig +++ b/arch/mips/configs/malta_kvm_guest_defconfig @@ -212,7 +212,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/malta_qemu_32r6_defconfig b/arch/mips/configs/malta_qemu_32r6_defconfig index 299088043164..e6c600dc1814 100644 --- a/arch/mips/configs/malta_qemu_32r6_defconfig +++ b/arch/mips/configs/malta_qemu_32r6_defconfig @@ -74,7 +74,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltaaprp_defconfig b/arch/mips/configs/maltaaprp_defconfig index 2b4b3a24f637..82b44b774553 100644 --- a/arch/mips/configs/maltaaprp_defconfig +++ b/arch/mips/configs/maltaaprp_defconfig @@ -76,7 +76,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltasmvp_defconfig b/arch/mips/configs/maltasmvp_defconfig index 425ddfd7cd78..4190fc6189a0 100644 --- a/arch/mips/configs/maltasmvp_defconfig +++ b/arch/mips/configs/maltasmvp_defconfig @@ -77,7 +77,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltasmvp_eva_defconfig b/arch/mips/configs/maltasmvp_eva_defconfig index 8beaa7ba1e52..a13c10e910ec 100644 --- a/arch/mips/configs/maltasmvp_eva_defconfig +++ b/arch/mips/configs/maltasmvp_eva_defconfig @@ -78,7 +78,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltaup_defconfig b/arch/mips/configs/maltaup_defconfig index 6e8b95ceb54a..b35f1fc690fb 100644 --- a/arch/mips/configs/maltaup_defconfig +++ b/arch/mips/configs/maltaup_defconfig @@ -75,7 +75,6 @@ CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y -CONFIG_NET_CLS_IND=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig index 6c026db96ff9..56861aef2756 100644 --- a/arch/mips/configs/maltaup_xpa_defconfig +++ b/arch/mips/configs/maltaup_xpa_defconfig @@ -212,7 +212,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_CFG80211=m CONFIG_MAC80211=m CONFIG_MAC80211_MESH=y diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig index 50632a3103dd..864c70fbe668 100644 --- a/arch/mips/configs/rb532_defconfig +++ b/arch/mips/configs/rb532_defconfig @@ -103,7 +103,6 @@ CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_CLS_IND=y 
CONFIG_HAMRADIO=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 94096299fc56..9a82dd11c0e9 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -254,10 +254,10 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v) #define atomic64_set(v, i) WRITE_ONCE((v)->counter, (i)) #define ATOMIC64_OP(op, c_op, asm_op) \ -static __inline__ void atomic64_##op(long i, atomic64_t * v) \ +static __inline__ void atomic64_##op(s64 i, atomic64_t * v) \ { \ if (kernel_uses_llsc) { \ - long temp; \ + s64 temp; \ \ loongson_llsc_mb(); \ __asm__ __volatile__( \ @@ -280,12 +280,12 @@ static __inline__ void atomic64_##op(long i, atomic64_t * v) \ } #define ATOMIC64_OP_RETURN(op, c_op, asm_op) \ -static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \ +static __inline__ s64 atomic64_##op##_return_relaxed(s64 i, atomic64_t * v) \ { \ - long result; \ + s64 result; \ \ if (kernel_uses_llsc) { \ - long temp; \ + s64 temp; \ \ loongson_llsc_mb(); \ __asm__ __volatile__( \ @@ -314,12 +314,12 @@ static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \ } #define ATOMIC64_FETCH_OP(op, c_op, asm_op) \ -static __inline__ long atomic64_fetch_##op##_relaxed(long i, atomic64_t * v) \ +static __inline__ s64 atomic64_fetch_##op##_relaxed(s64 i, atomic64_t * v) \ { \ - long result; \ + s64 result; \ \ if (kernel_uses_llsc) { \ - long temp; \ + s64 temp; \ \ loongson_llsc_mb(); \ __asm__ __volatile__( \ @@ -386,14 +386,14 @@ ATOMIC64_OPS(xor, ^=, xor) * Atomically test @v and subtract @i if @v is greater or equal than @i. * The function returns the old value of @v minus @i. */ -static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v) +static __inline__ s64 atomic64_sub_if_positive(s64 i, atomic64_t * v) { - long result; + s64 result; smp_mb__before_llsc(); if (kernel_uses_llsc) { - long temp; + s64 temp; __asm__ __volatile__( " .set push \n" diff --git a/arch/mips/include/asm/mach-ath79/ar933x_uart.h b/arch/mips/include/asm/mach-ath79/ar933x_uart.h index b8f8af7dc47c..cacf3545e018 100644 --- a/arch/mips/include/asm/mach-ath79/ar933x_uart.h +++ b/arch/mips/include/asm/mach-ath79/ar933x_uart.h @@ -24,8 +24,8 @@ #define AR933X_UART_CS_PARITY_S 0 #define AR933X_UART_CS_PARITY_M 0x3 #define AR933X_UART_CS_PARITY_NONE 0 -#define AR933X_UART_CS_PARITY_ODD 1 -#define AR933X_UART_CS_PARITY_EVEN 2 +#define AR933X_UART_CS_PARITY_ODD 2 +#define AR933X_UART_CS_PARITY_EVEN 3 #define AR933X_UART_CS_IF_MODE_S 2 #define AR933X_UART_CS_IF_MODE_M 0x3 #define AR933X_UART_CS_IF_MODE_NONE 0 diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index a25643d258cb..0ba4ce6e2bf3 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -258,9 +258,6 @@ extern bool __virt_addr_valid(const volatile void *kaddr); ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define UNCAC_ADDR(addr) (UNCAC_BASE + __pa(addr)) -#define CAC_ADDR(addr) ((unsigned long)__va((addr) - UNCAC_BASE)) - #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 27808d9461f4..aa16b85ddffc 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -13,6 +13,8 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -50,37 +52,6 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PTE_ORDER); -} - -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL, PTE_ORDER); - if (!pte) - return NULL; - clear_highpage(pte); - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, PTE_ORDER); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_pages(pte, PTE_ORDER); -} - #define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 4ccb465ef3f2..7d27194e3b45 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -20,6 +20,7 @@ #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/pgtable-bits.h> +#include <asm/cpu-features.h> struct mm_struct; struct vm_area_struct; @@ -626,6 +627,8 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define gup_fast_permitted(start, end) (!cpu_has_dc_aliases) + #include <asm-generic/pgtable.h> /* diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h index 0f813bb753c6..09cbe9042828 100644 --- a/arch/mips/include/asm/switch_to.h +++ b/arch/mips/include/asm/switch_to.h @@ -42,7 +42,7 @@ extern struct task_struct *ll_task; * inline to try to keep the overhead down. If we have been forced to run on * a "CPU" with an FPU because of a previous high level of FP computation, * but did not actually use the FPU during the most recent time-slice (CU1 - * isn't set), we undo the restriction on cpus_allowed. + * isn't set), we undo the restriction on cpus_mask. 
* * We're not calling set_cpus_allowed() here, because we have no need to * force prompt migration - we're already switching the current CPU to a @@ -57,7 +57,7 @@ do { \ test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ (!(KSTK_STATUS(prev) & ST0_CU1))) { \ clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ - prev->cpus_allowed = prev->thread.user_cpus_allowed; \ + prev->cpus_mask = prev->thread.user_cpus_allowed; \ } \ next->thread.emulated_fp = 0; \ } while(0) diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d41765cfbc6e..d0a9ed2ca2d6 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -133,6 +133,8 @@ #define SO_RCVTIMEO_NEW 66 #define SO_SNDTIMEO_NEW 67 +#define SO_DETACH_REUSEPORT_BPF 68 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index bedb5047aff3..1804dc9d8136 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -575,10 +575,6 @@ static void *jazz_dma_alloc(struct device *dev, size_t size, return NULL; } - if (!(attrs & DMA_ATTR_NON_CONSISTENT)) { - dma_cache_wback_inv((unsigned long)ret, size); - ret = (void *)UNCAC_ADDR(ret); - } return ret; } @@ -586,8 +582,6 @@ static void jazz_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { vdma_free(dma_handle); - if (!(attrs & DMA_ATTR_NON_CONSISTENT)) - vaddr = (void *)CAC_ADDR((unsigned long)vaddr); dma_direct_free_pages(dev, size, vaddr, dma_handle, attrs); } diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 180ad081afcf..1db29957a931 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -32,7 +32,7 @@ int __isa_exception_epc(struct pt_regs *regs) /* Calculate exception PC in branch delay slot. */ if (__get_user(inst, (u16 __user *) msk_isa16_mode(epc))) { /* This should never happen because delay slot was checked. */ - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return epc; } if (cpu_has_mips16) { @@ -305,7 +305,7 @@ int __microMIPS_compute_return_epc(struct pt_regs *regs) return 0; sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return -EFAULT; } @@ -328,7 +328,7 @@ int __MIPS16e_compute_return_epc(struct pt_regs *regs) /* Read the instruction. 
*/ addr = (u16 __user *)msk_isa16_mode(epc); if (__get_user(inst.full, addr)) { - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return -EFAULT; } @@ -343,7 +343,7 @@ int __MIPS16e_compute_return_epc(struct pt_regs *regs) case MIPS16e_jal_op: addr += 1; if (__get_user(inst2, addr)) { - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return -EFAULT; } fullinst = ((unsigned)inst.full << 16) | inst2; @@ -829,17 +829,17 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, sigill_dsp: pr_debug("%s: DSP branch but not DSP ASE - sending SIGILL.\n", current->comm); - force_sig(SIGILL, current); + force_sig(SIGILL); return -EFAULT; sigill_r2r6: pr_debug("%s: R2 branch but r2-to-r6 emulator is not present - sending SIGILL.\n", current->comm); - force_sig(SIGILL, current); + force_sig(SIGILL); return -EFAULT; sigill_r6: pr_debug("%s: R6 branch but no MIPSr6 ISA support - sending SIGILL.\n", current->comm); - force_sig(SIGILL, current); + force_sig(SIGILL); return -EFAULT; } EXPORT_SYMBOL_GPL(__compute_return_epc_for_insn); @@ -859,7 +859,7 @@ int __compute_return_epc(struct pt_regs *regs) */ addr = (unsigned int __user *) epc; if (__get_user(insn.word, addr)) { - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return -EFAULT; } @@ -867,7 +867,7 @@ int __compute_return_epc(struct pt_regs *regs) unaligned: printk("%s: unaligned epc - sending SIGBUS.\n", current->comm); - force_sig(SIGBUS, current); + force_sig(SIGBUS); return -EFAULT; } diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index 07c941c99e92..81ba1d3c367c 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -220,7 +220,7 @@ static int evaluate_branch_instruction(struct kprobe *p, struct pt_regs *regs, unaligned: pr_notice("%s: unaligned epc - sending SIGBUS.\n", current->comm); - force_sig(SIGBUS, current); + force_sig(SIGBUS); return -EFAULT; } diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index a7c0f97e4b0d..1a08428eedcf 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (retval) goto out_unlock; - cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); + cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr); cpumask_and(&mask, &allowed, cpu_active_mask); out_unlock: diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index d75337974ee9..f6efabcb4e92 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -641,7 +641,7 @@ asmlinkage void sys_sigreturn(void) if (sig < 0) goto badframe; else if (sig) - force_sig(sig, current); + force_sig(sig); /* * Don't let your children do this ... 
@@ -654,7 +654,7 @@ asmlinkage void sys_sigreturn(void) /* Unreached */ badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } #endif /* CONFIG_TRAD_SIGNALS */ @@ -678,7 +678,7 @@ asmlinkage void sys_rt_sigreturn(void) if (sig < 0) goto badframe; else if (sig) - force_sig(sig, current); + force_sig(sig); if (restore_altstack(&frame->rs_uc.uc_stack)) goto badframe; @@ -694,7 +694,7 @@ asmlinkage void sys_rt_sigreturn(void) /* Unreached */ badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } #ifdef CONFIG_TRAD_SIGNALS diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 9a6e58b48bb6..7bd00fad61af 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -71,7 +71,7 @@ asmlinkage void sysn32_rt_sigreturn(void) if (sig < 0) goto badframe; else if (sig) - force_sig(sig, current); + force_sig(sig); if (compat_restore_altstack(&frame->rs_uc.uc_stack)) goto badframe; @@ -87,7 +87,7 @@ asmlinkage void sysn32_rt_sigreturn(void) /* Unreached */ badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } static int setup_rt_frame_n32(void *sig_return, struct ksignal *ksig, diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c index df259618e834..299a7a28ca33 100644 --- a/arch/mips/kernel/signal_o32.c +++ b/arch/mips/kernel/signal_o32.c @@ -171,7 +171,7 @@ asmlinkage void sys32_rt_sigreturn(void) if (sig < 0) goto badframe; else if (sig) - force_sig(sig, current); + force_sig(sig); if (compat_restore_altstack(&frame->rs_uc.uc_stack)) goto badframe; @@ -187,7 +187,7 @@ asmlinkage void sys32_rt_sigreturn(void) /* Unreached */ badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } static int setup_rt_frame_32(void *sig_return, struct ksignal *ksig, @@ -273,7 +273,7 @@ asmlinkage void sys32_sigreturn(void) if (sig < 0) goto badframe; else if (sig) - force_sig(sig, current); + force_sig(sig); /* * Don't let your children do this ... 
@@ -286,5 +286,5 @@ asmlinkage void sys32_sigreturn(void) /* Unreached */ badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 0e2dd68ade57..97035e19ad03 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -372,3 +372,4 @@ 431 n32 fsconfig sys_fsconfig 432 n32 fsmount sys_fsmount 433 n32 fspick sys_fspick +434 n32 pidfd_open sys_pidfd_open diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 5eebfa0d155c..d7292722d3b0 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -348,3 +348,4 @@ 431 n64 fsconfig sys_fsconfig 432 n64 fsmount sys_fsmount 433 n64 fspick sys_fspick +434 n64 pidfd_open sys_pidfd_open diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 3cc1374e02d0..dba084c92f14 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -421,3 +421,4 @@ 431 o32 fsconfig sys_fsconfig 432 o32 fsmount sys_fsmount 433 o32 fspick sys_fspick +434 o32 pidfd_open sys_pidfd_open diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index c52766a5b85f..342e41de9d64 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -482,7 +482,7 @@ asmlinkage void do_be(struct pt_regs *regs) goto out; die_if_kernel("Oops", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); out: exception_exit(prev_state); @@ -705,7 +705,7 @@ asmlinkage void do_ov(struct pt_regs *regs) prev_state = exception_enter(); die_if_kernel("Integer overflow", regs); - force_sig_fault(SIGFPE, FPE_INTOVF, (void __user *)regs->cp0_epc, current); + force_sig_fault(SIGFPE, FPE_INTOVF, (void __user *)regs->cp0_epc); exception_exit(prev_state); } @@ -733,7 +733,7 @@ void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr, else if (fcr31 & FPU_CSR_INE_X) si_code = FPE_FLTRES; - force_sig_fault(SIGFPE, si_code, fault_addr, tsk); + force_sig_fault_to_task(SIGFPE, si_code, fault_addr, tsk); } int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31) @@ -750,7 +750,7 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31) return 1; case SIGBUS: - force_sig_fault(SIGBUS, BUS_ADRERR, fault_addr, current); + force_sig_fault(SIGBUS, BUS_ADRERR, fault_addr); return 1; case SIGSEGV: @@ -761,11 +761,11 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31) else si_code = SEGV_MAPERR; up_read(&current->mm->mmap_sem); - force_sig_fault(SIGSEGV, si_code, fault_addr, current); + force_sig_fault(SIGSEGV, si_code, fault_addr); return 1; default: - force_sig(sig, current); + force_sig(sig); return 1; } } @@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void) * restricted the allowed set to exclude any CPUs with FPUs, * we'll skip the procedure. 
*/ - if (cpumask_intersects(¤t->cpus_allowed, &mt_fpu_cpumask)) { + if (cpumask_intersects(¤t->cpus_mask, &mt_fpu_cpumask)) { cpumask_t tmask; current->thread.user_cpus_allowed - = current->cpus_allowed; - cpumask_and(&tmask, ¤t->cpus_allowed, + = current->cpus_mask; + cpumask_and(&tmask, ¤t->cpus_mask, &mt_fpu_cpumask); set_cpus_allowed_ptr(current, &tmask); set_thread_flag(TIF_FPUBOUND); @@ -943,11 +943,11 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code, die_if_kernel(b, regs); force_sig_fault(SIGFPE, code == BRK_DIVZERO ? FPE_INTDIV : FPE_INTOVF, - (void __user *) regs->cp0_epc, current); + (void __user *) regs->cp0_epc); break; case BRK_BUG: die_if_kernel("Kernel bug detected", regs); - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); break; case BRK_MEMU: /* @@ -962,15 +962,15 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code, return; die_if_kernel("Math emu break/trap", regs); - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); break; default: scnprintf(b, sizeof(b), "%s instruction in kernel code", str); die_if_kernel(b, regs); if (si_code) { - force_sig_fault(SIGTRAP, si_code, NULL, current); + force_sig_fault(SIGTRAP, si_code, NULL); } else { - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } } } @@ -1063,7 +1063,7 @@ out: return; out_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); goto out; } @@ -1105,7 +1105,7 @@ out: return; out_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); goto out; } @@ -1191,7 +1191,7 @@ no_r2_instr: if (unlikely(status > 0)) { regs->cp0_epc = old_epc; /* Undo skip-over. */ regs->regs[31] = old31; - force_sig(status, current); + force_sig(status); } out: @@ -1220,7 +1220,7 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action, die_if_kernel("COP2: Unhandled kernel unaligned access or invalid " "instruction", regs); - force_sig(SIGILL, current); + force_sig(SIGILL); return NOTIFY_OK; } @@ -1383,7 +1383,7 @@ asmlinkage void do_cpu(struct pt_regs *regs) if (unlikely(status > 0)) { regs->cp0_epc = old_epc; /* Undo skip-over. */ regs->regs[31] = old31; - force_sig(status, current); + force_sig(status); } break; @@ -1403,7 +1403,7 @@ asmlinkage void do_cpu(struct pt_regs *regs) * emulator too. */ if (raw_cpu_has_fpu || !cpu_has_mips_4_5_64_r2_r6) { - force_sig(SIGILL, current); + force_sig(SIGILL); break; } /* Fall through. 
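
/*
 * Background sketch: the scheduler renamed ->cpus_allowed to
 * ->cpus_mask, which is all the mt_ase_fp_affinity() hunk above
 * adjusts for. Illustrative helper written against the new name:
 */
static bool task_may_run_on(struct task_struct *p, int cpu)
{
	return cpumask_test_cpu(cpu, &p->cpus_mask);
}
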
*/ @@ -1437,7 +1437,7 @@ asmlinkage void do_cpu(struct pt_regs *regs) #else /* CONFIG_MIPS_FP_SUPPORT */ case 1: case 3: - force_sig(SIGILL, current); + force_sig(SIGILL); break; #endif /* CONFIG_MIPS_FP_SUPPORT */ @@ -1464,7 +1464,7 @@ asmlinkage void do_msa_fpe(struct pt_regs *regs, unsigned int msacsr) local_irq_enable(); die_if_kernel("do_msa_fpe invoked from kernel context!", regs); - force_sig(SIGFPE, current); + force_sig(SIGFPE); out: exception_exit(prev_state); } @@ -1477,7 +1477,7 @@ asmlinkage void do_msa(struct pt_regs *regs) prev_state = exception_enter(); if (!cpu_has_msa || test_thread_flag(TIF_32BIT_FPREGS)) { - force_sig(SIGILL, current); + force_sig(SIGILL); goto out; } @@ -1485,7 +1485,7 @@ asmlinkage void do_msa(struct pt_regs *regs) err = enable_restore_fp_context(1); if (err) - force_sig(SIGILL, current); + force_sig(SIGILL); out: exception_exit(prev_state); } @@ -1495,7 +1495,7 @@ asmlinkage void do_mdmx(struct pt_regs *regs) enum ctx_state prev_state; prev_state = exception_enter(); - force_sig(SIGILL, current); + force_sig(SIGILL); exception_exit(prev_state); } @@ -1521,7 +1521,7 @@ asmlinkage void do_watch(struct pt_regs *regs) if (test_tsk_thread_flag(current, TIF_LOAD_WATCH)) { mips_read_watch_registers(); local_irq_enable(); - force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL, current); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL); } else { mips_clear_watch_registers(); local_irq_enable(); @@ -1592,7 +1592,7 @@ asmlinkage void do_mt(struct pt_regs *regs) } die_if_kernel("MIPS MT Thread exception in kernel", regs); - force_sig(SIGILL, current); + force_sig(SIGILL); } @@ -1601,7 +1601,7 @@ asmlinkage void do_dsp(struct pt_regs *regs) if (cpu_has_dsp) panic("Unexpected DSP exception"); - force_sig(SIGILL, current); + force_sig(SIGILL); } asmlinkage void do_reserved(struct pt_regs *regs) diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 76e33f940971..92bd2b0f0548 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -1365,20 +1365,20 @@ fault: return; die_if_kernel("Unhandled kernel unaligned access", regs); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; sigbus: die_if_kernel("Unhandled kernel unaligned access", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); return; sigill: die_if_kernel ("Unhandled kernel unaligned access or invalid instruction", regs); - force_sig(SIGILL, current); + force_sig(SIGILL); } /* Recode table from 16-bit register notation to 32-bit GPR. 
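
/*
 * The fault-signal helpers follow the same convention as force_sig():
 * the short form targets current, while an explicit *_to_task variant
 * remains for the rare caller that means another thread, as the
 * force_fcr31_sig() hunk above shows.
 */
	force_sig_fault(SIGTRAP, TRAP_HWBKPT, NULL);		   /* to current   */
	force_sig_fault_to_task(SIGFPE, si_code, fault_addr, tsk); /* explicit tsk */
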
*/ @@ -1991,20 +1991,20 @@ fault: return; die_if_kernel("Unhandled kernel unaligned access", regs); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; sigbus: die_if_kernel("Unhandled kernel unaligned access", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); return; sigill: die_if_kernel ("Unhandled kernel unaligned access or invalid instruction", regs); - force_sig(SIGILL, current); + force_sig(SIGILL); } static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr) @@ -2271,20 +2271,20 @@ fault: return; die_if_kernel("Unhandled kernel unaligned access", regs); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; sigbus: die_if_kernel("Unhandled kernel unaligned access", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); return; sigill: die_if_kernel ("Unhandled kernel unaligned access or invalid instruction", regs); - force_sig(SIGILL, current); + force_sig(SIGILL); } asmlinkage void do_ade(struct pt_regs *regs) @@ -2364,7 +2364,7 @@ asmlinkage void do_ade(struct pt_regs *regs) sigbus: die_if_kernel("Kernel unaligned instruction access", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); /* * XXX On return from the signal handler we should advance the epc diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0369f26ab96d..2cfe839f0b3a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -123,9 +123,9 @@ int kvm_arch_hardware_setup(void) return 0; } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - *(int *)rtn = 0; + return 0; } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index f34d7ff5eb60..1e8d335025d7 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile @@ -7,7 +7,6 @@ obj-y += cache.o obj-y += context.o obj-y += extable.o obj-y += fault.o -obj-y += gup.o obj-y += init.o obj-y += mmap.o obj-y += page.o diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 3da216988672..33b409391ddb 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -62,8 +62,6 @@ void (*_dma_cache_wback_inv)(unsigned long start, unsigned long size); void (*_dma_cache_wback)(unsigned long start, unsigned long size); void (*_dma_cache_inv)(unsigned long start, unsigned long size); -EXPORT_SYMBOL(_dma_cache_wback_inv); - #endif /* CONFIG_DMA_NONCOHERENT */ /* diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index f9549d2fbea3..ed56c6fa7be2 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -44,33 +44,25 @@ static inline bool cpu_needs_post_dma_flush(struct device *dev) } } -void *arch_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +void arch_dma_prep_coherent(struct page *page, size_t size) { - void *ret; - - ret = dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); - if (ret && !(attrs & DMA_ATTR_NON_CONSISTENT)) { - dma_cache_wback_inv((unsigned long) ret, size); - ret = (void *)UNCAC_ADDR(ret); - } + dma_cache_wback_inv((unsigned long)page_address(page), size); +} - return ret; +void *uncached_kernel_address(void *addr) +{ + return (void *)(__pa(addr) + UNCAC_BASE); } -void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) +void *cached_kernel_address(void *addr) { - if (!(attrs & DMA_ATTR_NON_CONSISTENT)) - cpu_addr = (void *)CAC_ADDR((unsigned long)cpu_addr); - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, 
attrs); + return __va(addr) - UNCAC_BASE; } long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, dma_addr_t dma_addr) { - unsigned long addr = CAC_ADDR((unsigned long)cpu_addr); - return page_to_pfn(virt_to_page((void *)addr)); + return page_to_pfn(virt_to_page(cached_kernel_address(cpu_addr))); } pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 73d8a0f0b810..f589aa8f47d9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -223,7 +223,7 @@ bad_area_nosemaphore: pr_cont("\n"); } current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f; - force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } @@ -279,7 +279,7 @@ do_sigbus: #endif current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f; tsk->thread.cp0_badvaddr = address; - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); return; #ifndef CONFIG_64BIT diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c deleted file mode 100644 index 4c2b4483683c..000000000000 --- a/arch/mips/mm/gup.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for MIPS - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - * Copyright (C) 2011 Ralf Baechle - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/hugetlb.h> - -#include <asm/cpu-features.h> -#include <asm/pgtable.h> - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#else - return READ_ONCE(*ptep); -#endif -} - -static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t *ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - if (!pte_present(pte) || - pte_special(pte) || (write && !pte_write(pte))) { - pte_unmap(ptep); - return 0; - } - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - pte_unmap(ptep - 1); - return 1; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON(page != compound_head(page)); - VM_BUG_ON(page_count(page) == 0); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t pte = *(pte_t *)&pmd; - struct page *head, *page; - int refs; - - if (write && !pte_write(pte)) - return 0; - /* hugepages are never "special" */ - VM_BUG_ON(pte_special(pte)); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - get_head_page_multiple(head, refs); - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) 
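
/*
 * Round-trip sketch for the new MIPS uncached-segment hooks above
 * (illustrative): the uncached alias of a linear-map pointer is its
 * physical address offset into KSEG1 via UNCAC_BASE, and
 * cached_kernel_address() undoes the translation.
 */
	void *p = page_address(page);		/* cacheable, KSEG0   */
	void *u = uncached_kernel_address(p);	/* same memory, KSEG1 */
	WARN_ON(cached_kernel_address(u) != p);
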
-{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_huge(pmd))) { - if (!gup_huge_pmd(pmd, addr, next, write, pages,nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages,nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t pte = *(pte_t *)&pud; - struct page *head, *page; - int refs; - - if (write && !pte_write(pte)) - return 0; - /* hugepages are never "special" */ - VM_BUG_ON(pte_special(pte)); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - get_head_page_multiple(head, refs); - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_huge(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages,nr)) - return 0; - } else { - if (!gup_pmd_range(pud, addr, next, write, pages,nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch - * size will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. 
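
/*
 * The MIPS-private fast-GUP code being deleted here is superseded by
 * the generic mm/gup.c implementation (hence gup.o leaving the
 * Makefile above). Callers keep the same contract:
 */
	struct page *pages[16];
	int got = get_user_pages_fast(uaddr, 16, FOLL_WRITE, pages);
	/* got is the number of pages actually pinned, possibly < 16 */
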
- * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int ret, nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start || cpu_has_dc_aliases) - goto slow_irqon; - - /* XXX: batch / limit 'nr' */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, - pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; -slow: - local_irq_enable(); - -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, gup_flags); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - return ret; -} diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 50ee7213b432..d79f2b432318 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -203,7 +203,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) bool __virt_addr_valid(const volatile void *kaddr) { - unsigned long vaddr = (unsigned long)vaddr; + unsigned long vaddr = (unsigned long)kaddr; if ((vaddr < PAGE_OFFSET) || (vaddr >= MAP_BASE)) return false; diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 65b6e85447b1..144ceb0fba88 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -391,6 +391,7 @@ static struct work_registers build_get_work_registers(u32 **p) static void build_restore_work_registers(u32 **p) { if (scratch_reg >= 0) { + uasm_i_ehb(p); UASM_i_MFC0(p, 1, c0_kscratch(), scratch_reg); return; } @@ -668,10 +669,12 @@ static void build_restore_pagemask(u32 **p, struct uasm_reloc **r, uasm_i_mtc0(p, 0, C0_PAGEMASK); uasm_il_b(p, r, lid); } - if (scratch_reg >= 0) + if (scratch_reg >= 0) { + uasm_i_ehb(p); UASM_i_MFC0(p, 1, c0_kscratch(), scratch_reg); - else + } else { UASM_i_LW(p, 1, scratchpad_offset(0), 0); + } } else { /* Reset default page size */ if (PM_DEFAULT_MASK >> 16) { @@ -938,10 +941,12 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, uasm_i_jr(p, ptr); if (mode == refill_scratch) { - if (scratch_reg >= 0) + if (scratch_reg >= 0) { + uasm_i_ehb(p); UASM_i_MFC0(p, 1, c0_kscratch(), scratch_reg); - else + } else { UASM_i_LW(p, 1, scratchpad_offset(0), 0); + } } else { uasm_i_nop(p); } @@ -1258,6 +1263,7 @@ build_fast_tlb_refill_handler (u32 **p, struct uasm_label **l, UASM_i_MTC0(p, odd, C0_ENTRYLO1); /* load it */ if (c0_scratch_reg >= 0) { + uasm_i_ehb(p); UASM_i_MFC0(p, scratch, c0_kscratch(), c0_scratch_reg); build_tlb_write_entry(p, l, r, tlb_random); uasm_l_leave(l, *p); @@ -1603,15 +1609,17 @@ static void build_setup_pgd(void) uasm_i_dinsm(&p, a0, 0, 29, 64 - 29); uasm_l_tlbl_goaround1(&l, p); UASM_i_SLL(&p, a0, a0, 11); - 
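
/*
 * The mmap.c hunk above fixes a self-initialization bug: the old code
 * read the uninitialized local ("vaddr = (unsigned long)vaddr")
 * instead of the kaddr argument, so the range check tested garbage.
 * Callers such as this now get a meaningful answer:
 */
	if (!__virt_addr_valid(kaddr))
		return false;	/* not a valid linear-map kernel address */
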
uasm_i_jr(&p, 31); UASM_i_MTC0(&p, a0, C0_CONTEXT); + uasm_i_jr(&p, 31); + uasm_i_ehb(&p); } else { /* PGD in c0_KScratch */ - uasm_i_jr(&p, 31); if (cpu_has_ldpte) UASM_i_MTC0(&p, a0, C0_PWBASE); else UASM_i_MTC0(&p, a0, c0_kscratch(), pgd_reg); + uasm_i_jr(&p, 31); + uasm_i_ehb(&p); } #else #ifdef CONFIG_SMP @@ -1625,13 +1633,16 @@ static void build_setup_pgd(void) UASM_i_LA_mostly(&p, a2, pgdc); UASM_i_SW(&p, a0, uasm_rel_lo(pgdc), a2); #endif /* SMP */ - uasm_i_jr(&p, 31); /* if pgd_reg is allocated, save PGD also to scratch register */ - if (pgd_reg != -1) + if (pgd_reg != -1) { UASM_i_MTC0(&p, a0, c0_kscratch(), pgd_reg); - else + uasm_i_jr(&p, 31); + uasm_i_ehb(&p); + } else { + uasm_i_jr(&p, 31); uasm_i_nop(&p); + } #endif if (p >= (u32 *)tlbmiss_handler_setup_pgd_end) panic("tlbmiss_handler_setup_pgd space exceeded"); diff --git a/arch/mips/sgi-ip22/ip22-berr.c b/arch/mips/sgi-ip22/ip22-berr.c index 34bb9801d5ff..dc0110a607a5 100644 --- a/arch/mips/sgi-ip22/ip22-berr.c +++ b/arch/mips/sgi-ip22/ip22-berr.c @@ -98,7 +98,7 @@ void ip22_be_interrupt(int irq) field, regs->cp0_epc, field, regs->regs[31]); /* Assume it would be too dangerous to continue ... */ die_if_kernel("Oops", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } static int ip22_be_handler(struct pt_regs *regs, int is_fixup) diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c index 082541d33161..c0cf7baee36d 100644 --- a/arch/mips/sgi-ip22/ip28-berr.c +++ b/arch/mips/sgi-ip22/ip28-berr.c @@ -462,7 +462,7 @@ void ip22_be_interrupt(int irq) if (ip28_be_interrupt(regs) != MIPS_BE_DISCARD) { /* Assume it would be too dangerous to continue ... */ die_if_kernel("Oops", regs); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } else if (debug_be_interrupt) show_regs(regs); } diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index 83efe03d5c60..73ad29b180fb 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -74,7 +74,7 @@ int ip27_be_handler(struct pt_regs *regs, int is_fixup) show_regs(regs); dump_tlb_all(); while(1); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } void __init ip27_be_init(void) diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c index c1f12a9cf305..c860f95ab7ed 100644 --- a/arch/mips/sgi-ip32/ip32-berr.c +++ b/arch/mips/sgi-ip32/ip32-berr.c @@ -29,7 +29,7 @@ static int ip32_be_handler(struct pt_regs *regs, int is_fixup) show_regs(regs); dump_tlb_all(); while(1); - force_sig(SIGBUS, current); + force_sig(SIGBUS); } void __init ip32_be_init(void) diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 3299e287a477..fbd68329737f 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -1,18 +1,20 @@ # SPDX-License-Identifier: GPL-2.0-only # # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. 
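
/*
 * Pattern established by the tlbex.c hunks above: MIPS needs an
 * execution-hazard barrier between writing a KScratch register with
 * mtc0 and reading it back with mfc0, so an ehb is now emitted ahead
 * of every read-back:
 */
	uasm_i_ehb(p);
	UASM_i_MFC0(p, 1, c0_kscratch(), scratch_reg);
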
# config NDS32 def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_WANT_FRAME_POINTERS if FTRACE select CLKSRC_MMIO select CLONE_BACKWARDS select COMMON_CLK + select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index 3cbc749c79aa..e78b43d8389f 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -9,6 +9,9 @@ #include <asm/tlbflush.h> #include <asm/proc-fns.h> +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + /* * Since we have only two-level page tables, these are trivial */ @@ -22,22 +25,11 @@ extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); #define check_pgt_cache() do { } while (0) -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; - - pte = - (pte_t *) __get_free_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | - __GFP_ZERO); - - return pte; -} - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; - pte = alloc_pages(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO, 0); + pte = __pte_alloc_one(mm, GFP_PGTABLE_USER); if (pte) cpu_dcache_wb_page((unsigned long)page_address(pte)); @@ -45,21 +37,6 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) } /* - * Free one PTE table. - */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t * pte) -{ - if (pte) { - free_page((unsigned long)pte); - } -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - __free_page(pte); -} - -/* * Populate the pmdp entry with a pointer to the pte. This pmd is part * of the mm address space. * diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c index d0dbd4fe9645..490e3720d694 100644 --- a/arch/nds32/kernel/dma.c +++ b/arch/nds32/kernel/dma.c @@ -3,327 +3,13 @@ #include <linux/types.h> #include <linux/mm.h> -#include <linux/string.h> #include <linux/dma-noncoherent.h> -#include <linux/io.h> #include <linux/cache.h> #include <linux/highmem.h> -#include <linux/slab.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/proc-fns.h> -/* - * This is the page table (2MB) covering uncached, DMA consistent allocations - */ -static pte_t *consistent_pte; -static DEFINE_RAW_SPINLOCK(consistent_lock); - -/* - * VM region handling support. - * - * This should become something generic, handling VM region allocations for - * vmalloc and similar (ioremap, module space, etc). - * - * I envisage vmalloc()'s supporting vm_struct becoming: - * - * struct vm_struct { - * struct vm_region region; - * unsigned long flags; - * struct page **pages; - * unsigned int nr_pages; - * unsigned long phys_addr; - * }; - * - * get_vm_area() would then call vm_region_alloc with an appropriate - * struct vm_region head (eg): - * - * struct vm_region vmalloc_head = { - * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), - * .vm_start = VMALLOC_START, - * .vm_end = VMALLOC_END, - * }; - * - * However, vmalloc_head.vm_start is variable (typically, it is dependent on - * the amount of RAM found at boot time.) I would imagine that get_vm_area() - * would have to initialise this each time prior to calling vm_region_alloc(). 
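
/*
 * Several architectures in this series (alpha, nds32, nios2, parisc)
 * drop hand-rolled pte_alloc_one()/pte_free() pairs in favor of
 * <asm-generic/pgalloc.h>. GFP_PGTABLE_USER expands, as of this
 * series, to GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT. An arch that
 * needs extra cache maintenance keeps a thin wrapper, as the nds32
 * hunk above does:
 */
#define __HAVE_ARCH_PTE_ALLOC_ONE
#include <asm-generic/pgalloc.h>

static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	pgtable_t pte = __pte_alloc_one(mm, GFP_PGTABLE_USER);

	if (pte)	/* write the zeroed table back past the D-cache */
		cpu_dcache_wb_page((unsigned long)page_address(pte));
	return pte;
}
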
- */ -struct arch_vm_region { - struct list_head vm_list; - unsigned long vm_start; - unsigned long vm_end; - struct page *vm_pages; -}; - -static struct arch_vm_region consistent_head = { - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_start = CONSISTENT_BASE, - .vm_end = CONSISTENT_END, -}; - -static struct arch_vm_region *vm_region_alloc(struct arch_vm_region *head, - size_t size, int gfp) -{ - unsigned long addr = head->vm_start, end = head->vm_end - size; - unsigned long flags; - struct arch_vm_region *c, *new; - - new = kmalloc(sizeof(struct arch_vm_region), gfp); - if (!new) - goto out; - - raw_spin_lock_irqsave(&consistent_lock, flags); - - list_for_each_entry(c, &head->vm_list, vm_list) { - if ((addr + size) < addr) - goto nospc; - if ((addr + size) <= c->vm_start) - goto found; - addr = c->vm_end; - if (addr > end) - goto nospc; - } - -found: - /* - * Insert this entry _before_ the one we found. - */ - list_add_tail(&new->vm_list, &c->vm_list); - new->vm_start = addr; - new->vm_end = addr + size; - - raw_spin_unlock_irqrestore(&consistent_lock, flags); - return new; - -nospc: - raw_spin_unlock_irqrestore(&consistent_lock, flags); - kfree(new); -out: - return NULL; -} - -static struct arch_vm_region *vm_region_find(struct arch_vm_region *head, - unsigned long addr) -{ - struct arch_vm_region *c; - - list_for_each_entry(c, &head->vm_list, vm_list) { - if (c->vm_start == addr) - goto out; - } - c = NULL; -out: - return c; -} - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, - gfp_t gfp, unsigned long attrs) -{ - struct page *page; - struct arch_vm_region *c; - unsigned long order; - u64 mask = ~0ULL, limit; - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - - if (!consistent_pte) { - pr_err("%s: not initialized\n", __func__); - dump_stack(); - return NULL; - } - - if (dev) { - mask = dev->coherent_dma_mask; - - /* - * Sanity check the DMA mask - it must be non-zero, and - * must be able to be satisfied by a DMA allocation. - */ - if (mask == 0) { - dev_warn(dev, "coherent DMA mask is unset\n"); - goto no_page; - } - - } - - /* - * Sanity check the allocation size. - */ - size = PAGE_ALIGN(size); - limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || - size >= (CONSISTENT_END - CONSISTENT_BASE)) { - pr_warn("coherent allocation too big " - "(requested %#x mask %#llx)\n", size, mask); - goto no_page; - } - - order = get_order(size); - - if (mask != 0xffffffff) - gfp |= GFP_DMA; - - page = alloc_pages(gfp, order); - if (!page) - goto no_page; - - /* - * Invalidate any data that might be lurking in the - * kernel direct-mapped region for device DMA. - */ - { - unsigned long kaddr = (unsigned long)page_address(page); - memset(page_address(page), 0, size); - cpu_dma_wbinval_range(kaddr, kaddr + size); - } - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = vm_region_alloc(&consistent_head, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); - if (c) { - pte_t *pte = consistent_pte + CONSISTENT_OFFSET(c->vm_start); - struct page *end = page + (1 << order); - - c->vm_pages = page; - - /* - * Set the "dma handle" - */ - *handle = page_to_phys(page); - - do { - BUG_ON(!pte_none(*pte)); - - /* - * x86 does not mark the pages reserved... - */ - SetPageReserved(page); - set_pte(pte, mk_pte(page, prot)); - page++; - pte++; - } while (size -= PAGE_SIZE); - - /* - * Free the otherwise unused pages. 
- */ - while (page < end) { - __free_page(page); - page++; - } - - return (void *)c->vm_start; - } - - if (page) - __free_pages(page, order); -no_page: - *handle = ~0; - return NULL; -} - -void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t handle, unsigned long attrs) -{ - struct arch_vm_region *c; - unsigned long flags, addr; - pte_t *ptep; - - size = PAGE_ALIGN(size); - - raw_spin_lock_irqsave(&consistent_lock, flags); - - c = vm_region_find(&consistent_head, (unsigned long)cpu_addr); - if (!c) - goto no_area; - - if ((c->vm_end - c->vm_start) != size) { - pr_err("%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start); - addr = c->vm_start; - do { - pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); - unsigned long pfn; - - ptep++; - addr += PAGE_SIZE; - - if (!pte_none(pte) && pte_present(pte)) { - pfn = pte_pfn(pte); - - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - /* - * x86 does not mark the pages reserved... - */ - ClearPageReserved(page); - - __free_page(page); - continue; - } - } - - pr_crit("%s: bad page in kernel page table\n", __func__); - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - list_del(&c->vm_list); - - raw_spin_unlock_irqrestore(&consistent_lock, flags); - - kfree(c); - return; - -no_area: - raw_spin_unlock_irqrestore(&consistent_lock, flags); - pr_err("%s: trying to free invalid coherent area: %p\n", - __func__, cpu_addr); - dump_stack(); -} - -/* - * Initialise the consistent memory allocation. - */ -static int __init consistent_init(void) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int ret = 0; - - do { - pgd = pgd_offset(&init_mm, CONSISTENT_BASE); - pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); - if (!pmd) { - pr_err("%s: no pmd tables\n", __func__); - ret = -ENOMEM; - break; - } - /* The first level mapping may be created in somewhere. - * It's not necessary to warn here. 
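
/*
 * The hand-rolled consistent-mapping allocator deleted here is
 * replaced by the generic remapping allocator (DMA_DIRECT_REMAP plus
 * the atomic pool selected in the Kconfig hunk above); nds32 now only
 * supplies arch_dma_prep_coherent(). Driver-visible behavior is
 * unchanged:
 */
	dma_addr_t bus;
	void *cpu = dma_alloc_coherent(dev, SZ_4K, &bus, GFP_KERNEL);

	if (cpu)
		dma_free_coherent(dev, SZ_4K, cpu, bus);
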
*/ - /* WARN_ON(!pmd_none(*pmd)); */ - - pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); - if (!pte) { - ret = -ENOMEM; - break; - } - - consistent_pte = pte; - } while (0); - - return ret; -} - -core_initcall(consistent_init); - static inline void cache_op(phys_addr_t paddr, size_t size, void (*fn)(unsigned long start, unsigned long end)) { @@ -389,3 +75,14 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, BUG(); } } + +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + cache_op(page_to_phys(page), size, cpu_dma_wbinval_range); +} + +static int __init atomic_pool_init(void) +{ + return dma_atomic_pool_init(GFP_KERNEL, pgprot_noncached(PAGE_KERNEL)); +} +postcore_initcall(atomic_pool_init); diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c index cf0b8760f261..62bdafbc53f4 100644 --- a/arch/nds32/kernel/fpu.c +++ b/arch/nds32/kernel/fpu.c @@ -243,7 +243,7 @@ inline void handle_fpu_exception(struct pt_regs *regs) } force_sig_fault(si_signo, si_code, - (void __user *)instruction_pointer(regs), current); + (void __user *)instruction_pointer(regs)); done: own_fpu(); } diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c index 5f7660aa2d68..fe61513982b4 100644 --- a/arch/nds32/kernel/signal.c +++ b/arch/nds32/kernel/signal.c @@ -163,7 +163,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return regs->uregs[0]; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 5aa7c17da27a..f4d386b52622 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -205,7 +205,7 @@ int bad_syscall(int n, struct pt_regs *regs) } force_sig_fault(SIGILL, ILL_ILLTRP, - (void __user *)instruction_pointer(regs) - 4, current); + (void __user *)instruction_pointer(regs) - 4); die_if_kernel("Oops - bad syscall", regs, n); return regs->uregs[0]; } @@ -255,14 +255,15 @@ void __init early_trap_init(void) cpu_cache_wbinval_page(base, true); } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +static void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { + struct task_struct *tsk = current; + tsk->thread.trap_no = ENTRY_DEBUG_RELATED; tsk->thread.error_code = error_code; force_sig_fault(SIGTRAP, si_code, - (void __user *)instruction_pointer(regs), tsk); + (void __user *)instruction_pointer(regs)); } void do_debug_trap(unsigned long entry, unsigned long addr, @@ -274,7 +275,7 @@ void do_debug_trap(unsigned long entry, unsigned long addr, if (user_mode(regs)) { /* trap_signal */ - send_sigtrap(current, regs, 0, TRAP_BRKPT); + send_sigtrap(regs, 0, TRAP_BRKPT); } else { /* kernel_trap */ if (!fixup_exception(regs)) @@ -288,7 +289,7 @@ void unhandled_interruption(struct pt_regs *regs) show_regs(regs); if (!user_mode(regs)) do_exit(SIGKILL); - force_sig(SIGKILL, current); + force_sig(SIGKILL); } void unhandled_exceptions(unsigned long entry, unsigned long addr, @@ -299,7 +300,7 @@ void unhandled_exceptions(unsigned long entry, unsigned long addr, show_regs(regs); if (!user_mode(regs)) do_exit(SIGKILL); - force_sig(SIGKILL, current); + force_sig(SIGKILL); } extern int do_page_fault(unsigned long entry, unsigned long addr, @@ -326,7 +327,7 @@ void do_revinsn(struct pt_regs *regs) show_regs(regs); if (!user_mode(regs)) do_exit(SIGILL); - force_sig(SIGILL, current); + force_sig(SIGILL); } #ifdef CONFIG_ALIGNMENT_TRAP diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index 
68d5f2a27f38..064ae5d2159d 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -271,7 +271,7 @@ bad_area_nosemaphore: tsk->thread.address = addr; tsk->thread.error_code = error_code; tsk->thread.trap_no = entry; - force_sig_fault(SIGSEGV, si_code, (void __user *)addr, tsk); + force_sig_fault(SIGSEGV, si_code, (void __user *)addr); return; } @@ -340,7 +340,7 @@ do_sigbus: tsk->thread.address = addr; tsk->thread.error_code = error_code; tsk->thread.trap_no = entry; - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr); return; diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 26a9c760a98b..44b5da37e8bd 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -4,6 +4,7 @@ config NIOS2 select ARCH_32BIT_OFF_T select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_UNCACHED_SEGMENT select ARCH_NO_SWAP select TIMER_OF select GENERIC_ATOMIC64 diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index f1fbdc47bdaf..79fcac61f6ef 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -101,12 +101,6 @@ static inline bool pfn_valid(unsigned long pfn) # define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -# define UNCAC_ADDR(addr) \ - ((void *)((unsigned)(addr) | CONFIG_NIOS2_IO_REGION_BASE)) -# define CAC_ADDR(addr) \ - ((void *)(((unsigned)(addr) & ~CONFIG_NIOS2_IO_REGION_BASE) | \ - CONFIG_NIOS2_KERNEL_REGION_BASE)) - #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 3a149ead1207..4bc8cf72067e 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -12,6 +12,8 @@ #include <linux/mm.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -37,41 +39,6 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ORDER); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; - - pte = (pte_t *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, PTE_ORDER); - - return pte; -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL, PTE_ORDER); - if (pte) { - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - clear_highpage(pte); - } - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_pages((unsigned long)pte, PTE_ORDER); -} - -static inline void pte_free(struct mm_struct *mm, struct page *pte) -{ - pgtable_page_dtor(pte); - __free_pages(pte, PTE_ORDER); -} - #define __pte_free_tlb(tlb, pte, addr) \ do { \ pgtable_page_dtor(pte); \ diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c index 4a81876b6086..a42dd09c6578 100644 --- a/arch/nios2/kernel/signal.c +++ b/arch/nios2/kernel/signal.c @@ -120,7 +120,7 @@ asmlinkage int do_rt_sigreturn(struct switch_stack *sw) return rval; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -211,7 +211,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, return 0; give_sigsegv: - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return -EFAULT; } diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 3bc3cd22b750..486db793923c 100644 --- 
a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(die_lock); static void _send_sig(int signo, int code, unsigned long addr) { - force_sig_fault(signo, code, (void __user *) addr, current); + force_sig_fault(signo, code, (void __user *) addr); } void die(const char *str, struct pt_regs *regs, long err) diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index 4af9e5b5ba1c..9cb238664584 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -60,32 +60,28 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, } } -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) +void arch_dma_prep_coherent(struct page *page, size_t size) { - void *ret; + unsigned long start = (unsigned long)page_address(page); - /* optimized page clearing */ - gfp |= __GFP_ZERO; + flush_dcache_range(start, start + size); +} - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) - gfp |= GFP_DMA; +void *uncached_kernel_address(void *ptr) +{ + unsigned long addr = (unsigned long)ptr; - ret = (void *) __get_free_pages(gfp, get_order(size)); - if (ret != NULL) { - *dma_handle = virt_to_phys(ret); - flush_dcache_range((unsigned long) ret, - (unsigned long) ret + size); - ret = UNCAC_ADDR(ret); - } + addr |= CONFIG_NIOS2_IO_REGION_BASE; - return ret; + return (void *)ptr; } -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) +void *cached_kernel_address(void *ptr) { - unsigned long addr = (unsigned long) CAC_ADDR((unsigned long) vaddr); + unsigned long addr = (unsigned long)ptr; + + addr &= ~CONFIG_NIOS2_IO_REGION_BASE; + addr |= CONFIG_NIOS2_KERNEL_REGION_BASE; - free_pages(addr, get_order(size)); + return (void *)ptr; } diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 7cfb20555b10..bf326f0edd2f 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. # config OPENRISC diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 43e340c4cd9c..b41a79fcdbd9 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -94,15 +94,13 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, va = (unsigned long)page; - if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0) { - /* - * We need to iterate through the pages, clearing the dcache for - * them and setting the cache-inhibit bit. - */ - if (walk_page_range(va, va + size, &walk)) { - free_pages_exact(page, size); - return NULL; - } + /* + * We need to iterate through the pages, clearing the dcache for + * them and setting the cache-inhibit bit. 
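
/*
 * Nios II variant of the uncached-segment contract (sketch of the
 * intended mapping): the uncached alias is formed by OR-ing in
 * CONFIG_NIOS2_IO_REGION_BASE, and the cached view is recovered by
 * masking that off and restoring CONFIG_NIOS2_KERNEL_REGION_BASE,
 * mirroring the UNCAC_ADDR/CAC_ADDR macros removed from page.h above.
 */
	void *u = uncached_kernel_address(p);	/* p in the kernel region */
	void *c = cached_kernel_address(u);	/* intended: c == p       */
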
+ */ + if (walk_page_range(va, va + size, &walk)) { + free_pages_exact(page, size); + return NULL; } return (void *)va; @@ -118,10 +116,8 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, .mm = &init_mm }; - if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0) { - /* walk_page_range shouldn't be able to fail here */ - WARN_ON(walk_page_range(va, va + size, &walk)); - } + /* walk_page_range shouldn't be able to fail here */ + WARN_ON(walk_page_range(va, va + size, &walk)); free_pages_exact(vaddr, size); } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 801cad03a4c7..4f0754874d78 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -95,7 +95,7 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs) return regs->gpr[11]; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index e859bfb118a6..932a8ec2b520 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -244,7 +244,7 @@ void __init trap_init(void) asmlinkage void do_trap(struct pt_regs *regs, unsigned long address) { - force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)address, current); + force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)address); regs->pc += 4; } @@ -253,7 +253,7 @@ asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address) { if (user_mode(regs)) { /* Send a SIGBUS */ - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)address); } else { printk("KERNEL: Unaligned Access 0x%.8lx\n", address); show_registers(regs); @@ -266,7 +266,7 @@ asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address) { if (user_mode(regs)) { /* Send a SIGBUS */ - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } else { /* Kernel mode */ printk("KERNEL: Bus error (SIGBUS) 0x%.8lx\n", address); show_registers(regs); @@ -371,7 +371,7 @@ static inline void simulate_lwa(struct pt_regs *regs, unsigned long address, if (get_user(value, lwa_addr)) { if (user_mode(regs)) { - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; } @@ -418,7 +418,7 @@ static inline void simulate_swa(struct pt_regs *regs, unsigned long address, if (put_user(regs->gpr[rb], vaddr)) { if (user_mode(regs)) { - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; } @@ -461,7 +461,7 @@ asmlinkage void do_illegal_instruction(struct pt_regs *regs, if (user_mode(regs)) { /* Send a SIGILL */ - force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address, current); + force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)address); } else { /* Kernel mode */ printk("KERNEL: Illegal instruction (SIGILL) 0x%.8lx\n", address); diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 9eee5bf3db27..5d4d3a9691d0 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -209,7 +209,7 @@ bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { - force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } @@ -274,7 +274,7 @@ do_sigbus: * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); /* Kernel mode? 
Handle exceptions or die */ if (!user_mode(regs)) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4860efa91d7b..42875ff15671 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -59,6 +59,8 @@ config PARISC select HAVE_ARCH_KGDB select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DYNAMIC_FTRACE if $(cc-option,-fpatchable-function-entry=1,1) + select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index c19af26febe6..58d46665cad9 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -47,6 +47,24 @@ ifneq ($(SUBARCH),$(UTS_MACHINE)) endif endif +ifdef CONFIG_DYNAMIC_FTRACE +ifdef CONFIG_64BIT +NOP_COUNT := 8 +else +NOP_COUNT := 5 +endif + +export CC_USING_RECORD_MCOUNT:=1 +export CC_USING_PATCHABLE_FUNCTION_ENTRY:=1 + +KBUILD_AFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY=1 +KBUILD_CFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY=1 \ + -DFTRACE_PATCHABLE_FUNCTION_SIZE=$(NOP_COUNT) + +CC_FLAGS_FTRACE := -fpatchable-function-entry=$(NOP_COUNT),$(shell echo $$(($(NOP_COUNT)-1))) +KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/parisc/kernel/module.lds +endif + OBJCOPY_FLAGS =-O binary -R .note -R .comment -S cflags-y := -pipe diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h index 42b2c75a1645..958c0aa5dbb2 100644 --- a/arch/parisc/include/asm/ftrace.h +++ b/arch/parisc/include/asm/ftrace.h @@ -5,12 +5,23 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -#define MCOUNT_INSN_SIZE 4 - +#define MCOUNT_ADDR ((unsigned long)mcount) +#define MCOUNT_INSN_SIZE 4 +#define CC_USING_NOP_MCOUNT extern unsigned long sys_call_table[]; extern unsigned long return_address(unsigned int); +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_caller(void); + +struct dyn_arch_ftrace { +}; + +unsigned long ftrace_call_adjust(unsigned long addr); + +#endif + #define ftrace_return_address(n) return_address(n) #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/patch.h b/arch/parisc/include/asm/patch.h index 685b58a13968..400d84c6e504 100644 --- a/arch/parisc/include/asm/patch.h +++ b/arch/parisc/include/asm/patch.h @@ -4,8 +4,10 @@ /* stop machine and patch kernel text */ void patch_text(void *addr, unsigned int insn); +void patch_text_multiple(void *addr, u32 *insn, unsigned int len); /* patch kernel text with machine already stopped (e.g. 
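
/*
 * How the new parisc DYNAMIC_FTRACE works (sketch): instead of -pg,
 * the kernel is built with -fpatchable-function-entry=N,M, which
 * plants N nops per function with M of them ahead of the entry label.
 * With the NOP_COUNT values above this becomes
 *
 *	-fpatchable-function-entry=5,4	(32-bit)
 *	-fpatchable-function-entry=8,7	(64-bit)
 *
 * leaving one nop at the entry point itself; ftrace_call_adjust()
 * then shifts each __mcount_loc record from the first nop to that
 * last one: addr + (FTRACE_PATCHABLE_FUNCTION_SIZE - 1) * 4.
 */
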
in kgdb) */ -void __patch_text(void *addr, unsigned int insn); +void __patch_text(void *addr, u32 insn); +void __patch_text_multiple(void *addr, u32 *insn, unsigned int len); #endif diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index ea75cc966dae..4f2059a50fae 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -10,6 +10,8 @@ #include <asm/cache.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + /* Allocate the top level pgd (page directory) * * Here (for 64 bit kernels) we implement a Hybrid L2/L3 scheme: we @@ -122,37 +124,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) pmd_populate_kernel(mm, pmd, page_address(pte_page)) #define pmd_pgtable(pmd) pmd_page(pmd) -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - struct page *page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, struct page *pte) -{ - pgtable_page_dtor(pte); - pte_free_kernel(mm, page_address(pte)); -} - #define check_pgt_cache() do { } while (0) #endif diff --git a/arch/parisc/include/asm/psw.h b/arch/parisc/include/asm/psw.h index 76c301146c31..46921ffcc407 100644 --- a/arch/parisc/include/asm/psw.h +++ b/arch/parisc/include/asm/psw.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PARISC_PSW_H - +#define _PARISC_PSW_H #define PSW_I 0x00000001 #define PSW_D 0x00000002 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 66c5dd245ac7..10173c32195e 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -114,6 +114,8 @@ #define SO_RCVTIMEO_NEW 0x4040 #define SO_SNDTIMEO_NEW 0x4041 +#define SO_DETACH_REUSEPORT_BPF 0x4042 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index fc0df5c44468..c232266b517c 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -14,10 +14,11 @@ obj-y := cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \ ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities -CFLAGS_REMOVE_ftrace.o = -pg -CFLAGS_REMOVE_cache.o = -pg -CFLAGS_REMOVE_perf.o = -pg -CFLAGS_REMOVE_unwind.o = -pg +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_cache.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_perf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_unwind.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) endif obj-$(CONFIG_SMP) += smp.o diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 89c801c2b5d1..3e430590c1e1 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -2012,6 +2012,70 @@ ftrace_stub: #endif ENDPROC_CFI(mcount) +#ifdef CONFIG_DYNAMIC_FTRACE + +#ifdef CONFIG_64BIT +#define FTRACE_FRAME_SIZE (2*FRAME_SIZE) +#else +#define FTRACE_FRAME_SIZE FRAME_SIZE +#endif +ENTRY_CFI(ftrace_caller, caller,frame=FTRACE_FRAME_SIZE,CALLS,SAVE_RP,SAVE_SP) +ftrace_caller: + .global ftrace_caller + + STREG %r3, -FTRACE_FRAME_SIZE+1*REG_SZ(%sp) + ldo -FTRACE_FRAME_SIZE(%sp), %r3 + STREG %rp, -RP_OFFSET(%r3) + + 
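
/*
 * The psw.h hunk above completes a broken include guard: the header
 * tested _PARISC_PSW_H but never defined it, so it gave no protection
 * against double inclusion. Canonical pattern, for reference:
 */
#ifndef _PARISC_PSW_H
#define _PARISC_PSW_H
/* ... declarations ... */
#endif /* _PARISC_PSW_H */
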
/* Offset 0 is already allocated for %r1 */ + STREG %r23, 2*REG_SZ(%r3) + STREG %r24, 3*REG_SZ(%r3) + STREG %r25, 4*REG_SZ(%r3) + STREG %r26, 5*REG_SZ(%r3) + STREG %r28, 6*REG_SZ(%r3) + STREG %r29, 7*REG_SZ(%r3) +#ifdef CONFIG_64BIT + STREG %r19, 8*REG_SZ(%r3) + STREG %r20, 9*REG_SZ(%r3) + STREG %r21, 10*REG_SZ(%r3) + STREG %r22, 11*REG_SZ(%r3) + STREG %r27, 12*REG_SZ(%r3) + STREG %r31, 13*REG_SZ(%r3) + loadgp + ldo -16(%sp),%r29 +#endif + LDREG 0(%r3), %r25 + copy %rp, %r26 + ldo -8(%r25), %r25 + b,l ftrace_function_trampoline, %rp + copy %r3, %r24 + + LDREG -RP_OFFSET(%r3), %rp + LDREG 2*REG_SZ(%r3), %r23 + LDREG 3*REG_SZ(%r3), %r24 + LDREG 4*REG_SZ(%r3), %r25 + LDREG 5*REG_SZ(%r3), %r26 + LDREG 6*REG_SZ(%r3), %r28 + LDREG 7*REG_SZ(%r3), %r29 +#ifdef CONFIG_64BIT + LDREG 8*REG_SZ(%r3), %r19 + LDREG 9*REG_SZ(%r3), %r20 + LDREG 10*REG_SZ(%r3), %r21 + LDREG 11*REG_SZ(%r3), %r22 + LDREG 12*REG_SZ(%r3), %r27 + LDREG 13*REG_SZ(%r3), %r31 +#endif + LDREG 1*REG_SZ(%r3), %r3 + + LDREGM -FTRACE_FRAME_SIZE(%sp), %r1 + /* Adjust return point to jump back to beginning of traced function */ + ldo -4(%r1), %r1 + bv,n (%r1) + +ENDPROC_CFI(ftrace_caller) + +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER .align 8 ENTRY_CFI(return_to_handler, caller,frame=FRAME_SIZE) diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index a28f915993b1..d784ccdd8fef 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -7,17 +7,17 @@ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> * * future possible enhancements: - * - add CONFIG_DYNAMIC_FTRACE * - add CONFIG_STACK_TRACER */ #include <linux/init.h> #include <linux/ftrace.h> +#include <linux/uaccess.h> #include <asm/assembly.h> #include <asm/sections.h> #include <asm/ftrace.h> - +#include <asm/patch.h> #define __hot __attribute__ ((__section__ (".text.hot"))) @@ -50,13 +50,11 @@ void notrace __hot ftrace_function_trampoline(unsigned long parent, unsigned long self_addr, unsigned long org_sp_gr3) { - extern ftrace_func_t ftrace_trace_function; /* depends on CONFIG_DYNAMIC_FTRACE */ - - if (ftrace_trace_function != ftrace_stub) { - /* struct ftrace_ops *op, struct pt_regs *regs); */ - ftrace_trace_function(parent, self_addr, NULL, NULL); - return; - } +#ifndef CONFIG_DYNAMIC_FTRACE + extern ftrace_func_t ftrace_trace_function; +#endif + if (ftrace_trace_function != ftrace_stub) + ftrace_trace_function(self_addr, parent, NULL, NULL); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_return != (trace_func_graph_ret_t) ftrace_stub || @@ -75,3 +73,116 @@ void notrace __hot ftrace_function_trampoline(unsigned long parent, #endif } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +int ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + return 0; +} + +unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr+(FTRACE_PATCHABLE_FUNCTION_SIZE-1)*4; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + u32 insn[FTRACE_PATCHABLE_FUNCTION_SIZE]; + u32 *tramp; + int size, ret, i; + void *ip; + +#ifdef CONFIG_64BIT + unsigned long addr2 = + (unsigned long)dereference_function_descriptor((void *)addr); + + u32 ftrace_trampoline[] = { + 0x73c10208, /* std,ma r1,100(sp) */ + 0x0c2110c1, /* ldd -10(r1),r1 */ + 0xe820d002, /* bve,n (r1) */ + addr2 >> 32, + addr2 & 0xffffffff, + 0xe83f1fd7, 
/* b,l,n .-14,r1 */ + }; + + u32 ftrace_trampoline_unaligned[] = { + addr2 >> 32, + addr2 & 0xffffffff, + 0x37de0200, /* ldo 100(sp),sp */ + 0x73c13e01, /* std r1,-100(sp) */ + 0x34213ff9, /* ldo -4(r1),r1 */ + 0x50213fc1, /* ldd -20(r1),r1 */ + 0xe820d002, /* bve,n (r1) */ + 0xe83f1fcf, /* b,l,n .-20,r1 */ + }; + + BUILD_BUG_ON(ARRAY_SIZE(ftrace_trampoline_unaligned) > + FTRACE_PATCHABLE_FUNCTION_SIZE); +#else + u32 ftrace_trampoline[] = { + (u32)addr, + 0x6fc10080, /* stw,ma r1,40(sp) */ + 0x48213fd1, /* ldw -18(r1),r1 */ + 0xe820c002, /* bv,n r0(r1) */ + 0xe83f1fdf, /* b,l,n .-c,r1 */ + }; +#endif + + BUILD_BUG_ON(ARRAY_SIZE(ftrace_trampoline) > + FTRACE_PATCHABLE_FUNCTION_SIZE); + + size = sizeof(ftrace_trampoline); + tramp = ftrace_trampoline; + +#ifdef CONFIG_64BIT + if (rec->ip & 0x4) { + size = sizeof(ftrace_trampoline_unaligned); + tramp = ftrace_trampoline_unaligned; + } +#endif + + ip = (void *)(rec->ip + 4 - size); + + ret = probe_kernel_read(insn, ip, size); + if (ret) + return ret; + + for (i = 0; i < size / 4; i++) { + if (insn[i] != INSN_NOP) + return -EINVAL; + } + + __patch_text_multiple(ip, tramp, size); + return 0; +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + u32 insn[FTRACE_PATCHABLE_FUNCTION_SIZE]; + int i; + + for (i = 0; i < ARRAY_SIZE(insn); i++) + insn[i] = INSN_NOP; + + __patch_text_multiple((void *)rec->ip + 4 - sizeof(insn), + insn, sizeof(insn)); + return 0; +} +#endif diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 1f0f29a289d3..ac5f34993b53 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -33,9 +33,9 @@ * However, SEGREL32 is used only for PARISC unwind entries, and we want * those entries to have an absolute address, and not just an offset. * - * The unwind table mechanism has the ability to specify an offset for + * The unwind table mechanism has the ability to specify an offset for * the unwind table; however, because we split off the init functions into - * a different piece of memory, it is not possible to do this using a + * a different piece of memory, it is not possible to do this using a * single offset. Instead, we use the above hack for now. */ @@ -53,12 +53,6 @@ #include <asm/unwind.h> #include <asm/sections.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt...) -#endif - #define RELOC_REACHABLE(val, bits) \ (( ( !((val) & (1<<((bits)-1))) && ((val)>>(bits)) != 0 ) || \ ( ((val) & (1<<((bits)-1))) && ((val)>>(bits)) != (((__typeof__(val))(~0))>>((bits)+2)))) ? 
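
/*
 * Note on the 64-bit trampoline above: parisc64 "function addresses"
 * are plabel descriptors, so the real entry point is resolved before
 * its two 32-bit halves are embedded in the trampoline words
 * (tramp_hi/tramp_lo below are illustrative names):
 */
	unsigned long entry = (unsigned long)
		dereference_function_descriptor((void *)addr);
	u32 tramp_hi = entry >> 32;
	u32 tramp_lo = entry & 0xffffffff;
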
\ @@ -300,7 +294,7 @@ unsigned int arch_mod_section_prepend(struct module *mod, * sizeof(struct stub_entry); } -#define CONST +#define CONST int module_frob_arch_sections(CONST Elf_Ehdr *hdr, CONST Elf_Shdr *sechdrs, CONST char *secstrings, @@ -386,7 +380,7 @@ static Elf64_Word get_got(struct module *me, unsigned long value, long addend) got[i].addr = value; out: - DEBUGP("GOT ENTRY %d[%x] val %lx\n", i, i*sizeof(struct got_entry), + pr_debug("GOT ENTRY %d[%lx] val %lx\n", i, i*sizeof(struct got_entry), value); return i * sizeof(struct got_entry); } @@ -539,7 +533,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, //unsigned long dp = (unsigned long)$global$; register unsigned long dp asm ("r27"); - DEBUGP("Applying relocate section %u to %u\n", relsec, + pr_debug("Applying relocate section %u to %u\n", relsec, targetsec); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ @@ -563,7 +557,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, #if 0 #define r(t) ELF32_R_TYPE(rel[i].r_info)==t ? #t : - DEBUGP("Symbol %s loc 0x%x val 0x%x addend 0x%x: %s\n", + pr_debug("Symbol %s loc 0x%x val 0x%x addend 0x%x: %s\n", strtab + sym->st_name, (uint32_t)loc, val, addend, r(R_PARISC_PLABEL32) @@ -604,7 +598,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, /* See note about special handling of SEGREL32 at * the beginning of this file. */ - *loc = fsel(val, addend); + *loc = fsel(val, addend); break; case R_PARISC_SECREL32: /* 32-bit section relative address. */ @@ -683,7 +677,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, Elf_Addr loc0; unsigned int targetsec = sechdrs[relsec].sh_info; - DEBUGP("Applying relocate section %u to %u\n", relsec, + pr_debug("Applying relocate section %u to %u\n", relsec, targetsec); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ @@ -725,7 +719,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, case R_PARISC_LTOFF21L: /* LT-relative; left 21 bits */ val = get_got(me, val, addend); - DEBUGP("LTOFF21L Symbol %s loc %p val %lx\n", + pr_debug("LTOFF21L Symbol %s loc %p val %llx\n", strtab + sym->st_name, loc, val); val = lrsel(val, 0); @@ -736,14 +730,14 @@ int apply_relocate_add(Elf_Shdr *sechdrs, /* LT-relative; right 14 bits */ val = get_got(me, val, addend); val = rrsel(val, 0); - DEBUGP("LTOFF14R Symbol %s loc %p val %lx\n", + pr_debug("LTOFF14R Symbol %s loc %p val %llx\n", strtab + sym->st_name, loc, val); *loc = mask(*loc, 14) | reassemble_14(val); break; case R_PARISC_PCREL22F: /* PC-relative; 22 bits */ - DEBUGP("PCREL22F Symbol %s loc %p val %lx\n", + pr_debug("PCREL22F Symbol %s loc %p val %llx\n", strtab + sym->st_name, loc, val); val += addend; @@ -775,7 +769,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, val = get_stub(me, val, addend, ELF_STUB_GOT, loc0, targetsec); } - DEBUGP("STUB FOR %s loc %lx, val %lx+%lx at %lx\n", + pr_debug("STUB FOR %s loc %px, val %llx+%llx at %llx\n", strtab + sym->st_name, loc, sym->st_value, addend, val); val = (val - dot - 8)/4; @@ -799,7 +793,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, /* See note about special handling of SEGREL32 at * the beginning of this file. */ - *loc = fsel(val, addend); + *loc = fsel(val, addend); break; case R_PARISC_SECREL32: /* 32-bit section relative address. 
*/ @@ -809,14 +803,14 @@ int apply_relocate_add(Elf_Shdr *sechdrs, /* 64-bit function address */ if(in_local(me, (void *)(val + addend))) { *loc64 = get_fdesc(me, val+addend); - DEBUGP("FDESC for %s at %p points to %lx\n", + pr_debug("FDESC for %s at %llx points to %llx\n", strtab + sym->st_name, *loc64, ((Elf_Fdesc *)*loc64)->addr); } else { /* if the symbol is not local to this * module then val+addend is a pointer * to the function descriptor */ - DEBUGP("Non local FPTR64 Symbol %s loc %p val %lx\n", + pr_debug("Non local FPTR64 Symbol %s loc %p val %llx\n", strtab + sym->st_name, loc, val); *loc64 = val + addend; @@ -847,7 +841,7 @@ register_unwind_table(struct module *me, end = table + sechdrs[me->arch.unwind_section].sh_size; gp = (Elf_Addr)me->core_layout.base + me->arch.got_offset; - DEBUGP("register_unwind_table(), sect = %d at 0x%p - 0x%p (gp=0x%lx)\n", + pr_debug("register_unwind_table(), sect = %d at 0x%p - 0x%p (gp=0x%lx)\n", me->arch.unwind_section, table, end, gp); me->arch.unwind = unwind_table_add(me->name, 0, gp, table, end); } @@ -868,6 +862,7 @@ int module_finalize(const Elf_Ehdr *hdr, const char *strtab = NULL; const Elf_Shdr *s; char *secstrings; + int err, symindex = -1; Elf_Sym *newptr, *oldptr; Elf_Shdr *symhdr = NULL; #ifdef DEBUG @@ -894,6 +889,7 @@ int module_finalize(const Elf_Ehdr *hdr, if(sechdrs[i].sh_type == SHT_SYMTAB && (sechdrs[i].sh_flags & SHF_ALLOC)) { int strindex = sechdrs[i].sh_link; + symindex = i; /* FIXME: AWFUL HACK * The cast is to drop the const from * the sechdrs pointer */ @@ -903,7 +899,7 @@ int module_finalize(const Elf_Ehdr *hdr, } } - DEBUGP("module %s: strtab %p, symhdr %p\n", + pr_debug("module %s: strtab %p, symhdr %p\n", me->name, strtab, symhdr); if(me->arch.got_count > MAX_GOTS) { @@ -922,7 +918,7 @@ int module_finalize(const Elf_Ehdr *hdr, oldptr = (void *)symhdr->sh_addr; newptr = oldptr + 1; /* we start counting at 1 */ nsyms = symhdr->sh_size / sizeof(Elf_Sym); - DEBUGP("OLD num_symtab %lu\n", nsyms); + pr_debug("OLD num_symtab %lu\n", nsyms); for (i = 1; i < nsyms; i++) { oldptr++; /* note, count starts at 1 so preincrement */ @@ -937,7 +933,7 @@ int module_finalize(const Elf_Ehdr *hdr, } nsyms = newptr - (Elf_Sym *)symhdr->sh_addr; - DEBUGP("NEW num_symtab %lu\n", nsyms); + pr_debug("NEW num_symtab %lu\n", nsyms); symhdr->sh_size = nsyms * sizeof(Elf_Sym); /* find .altinstructions section */ @@ -949,8 +945,24 @@ int module_finalize(const Elf_Ehdr *hdr, if (!strcmp(".altinstructions", secname)) /* patch .altinstructions */ apply_alternatives(aseg, aseg + s->sh_size, me->name); - } + /* For 32 bit kernels we're compiling modules with + * -ffunction-sections so we must relocate the addresses in the + *__mcount_loc section. 
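(The sketch below expands on the comment being closed here.) __mcount_loc is simply an array of addresses, one per patchable function entry, that ftrace walks to know where it may patch. With -ffunction-sections each entry is emitted as a relocation against a per-function section, so a 32-bit module's __mcount_loc still holds unrelocated values when module_finalize() runs; that is why the hunk below feeds the section back through apply_relocate()/apply_relocate_add(). A minimal, hypothetical walk of the array after relocation:

#include <linux/printk.h>

static void walk_mcount_loc(unsigned long *start, unsigned long *end)
{
        unsigned long *p;

        for (p = start; p < end; p++)
                pr_info("patchable entry at %px\n", (void *)*p);
}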
+ */ + if (symindex != -1 && !strcmp(secname, "__mcount_loc")) { + if (s->sh_type == SHT_REL) + err = apply_relocate((Elf_Shdr *)sechdrs, + strtab, symindex, + s - sechdrs, me); + else if (s->sh_type == SHT_RELA) + err = apply_relocate_add((Elf_Shdr *)sechdrs, + strtab, symindex, + s - sechdrs, me); + if (err) + return err; + } + } return 0; } diff --git a/arch/parisc/kernel/module.lds b/arch/parisc/kernel/module.lds new file mode 100644 index 000000000000..1a9a92aca5c8 --- /dev/null +++ b/arch/parisc/kernel/module.lds @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +SECTIONS { + __mcount_loc : { + *(__patchable_function_entries) + } +} diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index cdcd981278b3..80a0ab372802 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -17,15 +17,20 @@ struct patch { void *addr; - unsigned int insn; + u32 *insn; + unsigned int len; }; -static void __kprobes *patch_map(void *addr, int fixmap) +static DEFINE_RAW_SPINLOCK(patch_lock); + +static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags, + int *need_unmap) { unsigned long uintaddr = (uintptr_t) addr; bool module = !core_kernel_text(uintaddr); struct page *page; + *need_unmap = 0; if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) page = vmalloc_to_page(addr); else if (!module && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) @@ -33,36 +38,74 @@ static void __kprobes *patch_map(void *addr, int fixmap) else return addr; + *need_unmap = 1; set_fixmap(fixmap, page_to_phys(page)); + if (flags) + raw_spin_lock_irqsave(&patch_lock, *flags); + else + __acquire(&patch_lock); return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); } -static void __kprobes patch_unmap(int fixmap) +static void __kprobes patch_unmap(int fixmap, unsigned long *flags) { clear_fixmap(fixmap); + + if (flags) + raw_spin_unlock_irqrestore(&patch_lock, *flags); + else + __release(&patch_lock); +} + +void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) +{ + unsigned long start = (unsigned long)addr; + unsigned long end = (unsigned long)addr + len; + unsigned long flags; + u32 *p, *fixmap; + int mapped; + + /* Make sure we don't have any aliases in cache */ + flush_kernel_vmap_range(addr, len); + flush_icache_range(start, end); + + p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped); + + while (len >= 4) { + *p++ = *insn++; + addr += sizeof(u32); + len -= sizeof(u32); + if (len && offset_in_page(addr) == 0) { + /* + * We're crossing a page boundary, so + * need to remap + */ + flush_kernel_vmap_range((void *)fixmap, + (p-fixmap) * sizeof(*p)); + if (mapped) + patch_unmap(FIX_TEXT_POKE0, &flags); + p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, + &mapped); + } + } + + flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p)); + if (mapped) + patch_unmap(FIX_TEXT_POKE0, &flags); + flush_icache_range(start, end); } -void __kprobes __patch_text(void *addr, unsigned int insn) +void __kprobes __patch_text(void *addr, u32 insn) { - void *waddr = addr; - int size; - - waddr = patch_map(addr, FIX_TEXT_POKE0); - *(u32 *)waddr = insn; - size = sizeof(u32); - flush_kernel_vmap_range(waddr, size); - patch_unmap(FIX_TEXT_POKE0); - flush_icache_range((uintptr_t)(addr), - (uintptr_t)(addr) + size); + __patch_text_multiple(addr, &insn, sizeof(insn)); } static int __kprobes patch_text_stop_machine(void *data) { struct patch *patch = data; - __patch_text(patch->addr, patch->insn); - + __patch_text_multiple(patch->addr, patch->insn, 
patch->len); return 0; } @@ -70,7 +113,20 @@ void __kprobes patch_text(void *addr, unsigned int insn) { struct patch patch = { .addr = addr, + .insn = &insn, + .len = sizeof(insn), + }; + + stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL); +} + +void __kprobes patch_text_multiple(void *addr, u32 *insn, unsigned int len) +{ + + struct patch patch = { + .addr = addr, .insn = insn, + .len = len }; stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL); diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 239162355b58..ca35d9a76e50 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -394,17 +394,20 @@ pcxl_dma_init(void) __initcall(pcxl_dma_init); -static void *pcxl_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) +void *arch_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { unsigned long vaddr; unsigned long paddr; int order; + if (boot_cpu_data.cpu_type != pcxl2 && boot_cpu_data.cpu_type != pcxl) + return NULL; + order = get_order(size); size = 1 << (order + PAGE_SHIFT); vaddr = pcxl_alloc_range(size); - paddr = __get_free_pages(flag | __GFP_ZERO, order); + paddr = __get_free_pages(gfp | __GFP_ZERO, order); flush_kernel_dcache_range(paddr, size); paddr = __pa(paddr); map_uncached_pages(vaddr, size, paddr); @@ -421,44 +424,19 @@ static void *pcxl_dma_alloc(struct device *dev, size_t size, return (void *)vaddr; } -static void *pcx_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) -{ - void *addr; - - if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0) - return NULL; - - addr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size)); - if (addr) - *dma_handle = (dma_addr_t)virt_to_phys(addr); - - return addr; -} - -void *arch_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) -{ - - if (boot_cpu_data.cpu_type == pcxl2 || boot_cpu_data.cpu_type == pcxl) - return pcxl_dma_alloc(dev, size, dma_handle, gfp, attrs); - else - return pcx_dma_alloc(dev, size, dma_handle, gfp, attrs); -} - void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { int order = get_order(size); - if (boot_cpu_data.cpu_type == pcxl2 || boot_cpu_data.cpu_type == pcxl) { - size = 1 << (order + PAGE_SHIFT); - unmap_uncached_pages((unsigned long)vaddr, size); - pcxl_free_range((unsigned long)vaddr, size); + WARN_ON_ONCE(boot_cpu_data.cpu_type != pcxl2 && + boot_cpu_data.cpu_type != pcxl); - vaddr = __va(dma_handle); - } - free_pages((unsigned long)vaddr, get_order(size)); + size = 1 << (order + PAGE_SHIFT); + unmap_uncached_pages((unsigned long)vaddr, size); + pcxl_free_range((unsigned long)vaddr, size); + + free_pages((unsigned long)__va(dma_handle), order); } void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index a3d2fb4e6dd2..f642ba378ffa 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -88,9 +88,9 @@ void user_enable_single_step(struct task_struct *task) ptrace_disable(task); /* Don't wake up the task, but let the parent know something happened. 
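The rewritten patch.c above follows the standard live-patching shape: kernel text is mapped read-only, so writes go through a temporary fixmap alias of the target page (remapped at each page crossing), dcache flushes on the alias make the stores visible, an icache flush on the real address makes CPUs fetch the new instructions, and stop_machine() keeps other CPUs from executing half-patched text. A compressed sketch of that shape, with invented helper names standing in for the fixmap and flush calls used above:

static void patch_one_word_via_alias(u32 *real, u32 insn)
{
        void *alias;

        alias = map_writable_alias(real);        /* set_fixmap(FIX_TEXT_POKE0, ...) */
        *(u32 *)alias = insn;
        flush_dcache_alias(alias, sizeof(insn)); /* flush_kernel_vmap_range() */
        unmap_writable_alias();                  /* clear_fixmap() */
        flush_icache_real(real, sizeof(insn));   /* flush_icache_range() */
}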
*/ - force_sig_fault(SIGTRAP, TRAP_TRACE, - (void __user *) (task_regs(task)->iaoq[0] & ~3), - task); + force_sig_fault_to_task(SIGTRAP, TRAP_TRACE, + (void __user *) (task_regs(task)->iaoq[0] & ~3), + task); /* notify_parent(task, SIGCHLD); */ return; } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 848c1934680b..02895a8f2c55 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -164,7 +164,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) give_sigsegv: DBG(1,"sys_rt_sigreturn: Sending SIGSEGV\n"); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return; } diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index c9e377d59232..5022b9e179c2 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -430,3 +430,4 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 096e319adeb3..58dcf445e32f 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -275,7 +275,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) static void handle_gdb_break(struct pt_regs *regs, int wot) { force_sig_fault(SIGTRAP, wot, - (void __user *) (regs->iaoq[0] & ~3), current); + (void __user *) (regs->iaoq[0] & ~3)); } static void handle_break(struct pt_regs *regs) @@ -609,13 +609,13 @@ void notrace handle_interruption(int code, struct pt_regs *regs) si_code = ILL_PRVREG; give_sigill: force_sig_fault(SIGILL, si_code, - (void __user *) regs->iaoq[0], current); + (void __user *) regs->iaoq[0]); return; case 12: /* Overflow Trap, let the userland signal handler do the cleanup */ force_sig_fault(SIGFPE, FPE_INTOVF, - (void __user *) regs->iaoq[0], current); + (void __user *) regs->iaoq[0]); return; case 13: @@ -627,7 +627,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs) * to by si_addr. */ force_sig_fault(SIGFPE, FPE_CONDTRAP, - (void __user *) regs->iaoq[0], current); + (void __user *) regs->iaoq[0]); return; } /* The kernel doesn't want to handle condition codes */ @@ -739,7 +739,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs) force_sig_fault(SIGSEGV, SEGV_MAPERR, (code == 7)? ((void __user *) regs->iaoq[0]) : - ((void __user *) regs->ior), current); + ((void __user *) regs->ior)); return; case 28: @@ -754,7 +754,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs) task_pid_nr(current), current->comm); /* SIGBUS, for lack of a better one. */ force_sig_fault(SIGBUS, BUS_OBJERR, - (void __user *)regs->ior, current); + (void __user *)regs->ior); return; } pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC); @@ -770,7 +770,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs) code, fault_space, task_pid_nr(current), current->comm); force_sig_fault(SIGSEGV, SEGV_MAPERR, - (void __user *)regs->ior, current); + (void __user *)regs->ior); return; } } diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 30161b7c9ac2..237d20dd5622 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -676,14 +676,14 @@ void handle_unaligned(struct pt_regs *regs) if (ret == ERR_PAGEFAULT) { force_sig_fault(SIGSEGV, SEGV_MAPERR, - (void __user *)regs->ior, current); + (void __user *)regs->ior); } else { force_sigbus: /* couldn't handle it ... 
*/ force_sig_fault(SIGBUS, BUS_ADRALN, - (void __user *)regs->ior, current); + (void __user *)regs->ior); } return; diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index cd33b4feacb1..99cd24f2ea01 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -18,6 +18,8 @@ *(.data..vm0.pgd) \ *(.data..vm0.pte) +#define CC_USING_PATCHABLE_FUNCTION_ENTRY + #include <asm-generic/vmlinux.lds.h> /* needed for the processor specific cache alignment size */ diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c index c83237c0cbc1..6ce427b58836 100644 --- a/arch/parisc/math-emu/driver.c +++ b/arch/parisc/math-emu/driver.c @@ -104,7 +104,7 @@ handle_fpe(struct pt_regs *regs) memcpy(regs->fr, frcopy, sizeof regs->fr); if (signalcode != 0) { force_sig_fault(signalcode >> 24, signalcode & 0xffffff, - (void __user *) regs->iaoq[0], current); + (void __user *) regs->iaoq[0]); return -1; } diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index c8e8b7c05558..6dd4669ce7a5 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -403,13 +403,13 @@ bad_area: lsb = PAGE_SHIFT; force_sig_mceerr(BUS_MCEERR_AR, (void __user *) address, - lsb, current); + lsb); return; } #endif show_signal_msg(regs, code, address, tsk, vma); - force_sig_fault(signo, si_code, (void __user *) address, current); + force_sig_fault(signo, si_code, (void __user *) address); return; } diff --git a/arch/parisc/mm/fixmap.c b/arch/parisc/mm/fixmap.c index c8d41b54fb19..474cd241c150 100644 --- a/arch/parisc/mm/fixmap.c +++ b/arch/parisc/mm/fixmap.c @@ -10,7 +10,7 @@ #include <asm/cacheflush.h> #include <asm/fixmap.h> -void set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +void notrace set_fixmap(enum fixed_addresses idx, phys_addr_t phys) { unsigned long vaddr = __fix_to_virt(idx); pgd_t *pgd = pgd_offset_k(vaddr); @@ -28,13 +28,16 @@ void set_fixmap(enum fixed_addresses idx, phys_addr_t phys) flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); } -void clear_fixmap(enum fixed_addresses idx) +void notrace clear_fixmap(enum fixed_addresses idx) { unsigned long vaddr = __fix_to_virt(idx); pgd_t *pgd = pgd_offset_k(vaddr); pmd_t *pmd = pmd_offset(pgd, vaddr); pte_t *pte = pte_offset_kernel(pmd, vaddr); + if (WARN_ON(pte_none(*pte))) + return; + pte_clear(&init_mm, vaddr, pte); flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8c1c636308c8..24a41f919309 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -125,6 +125,7 @@ config PPC select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV + select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 @@ -185,12 +186,12 @@ config PPC select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL select HAVE_EBPF_JIT if PPC64 select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) + select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC - select HAVE_GENERIC_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT @@ -898,7 +899,7 @@ config PPC_MEM_KEYS page-based protections, but without requiring modification of the page tables when an application changes 
protection domains. - For details, see Documentation/vm/protection-keys.rst + For details, see Documentation/core-api/protection-keys.rst If unsure, say y. diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 7c6baf6df139..aa51b9b66fa2 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -301,7 +301,6 @@ CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_IRDA=m CONFIG_IRLAN=m CONFIG_IRNET=m diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 52eafaf74054..31c231ea56b7 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -297,24 +297,24 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v) #define ATOMIC64_INIT(i) { (i) } -static __inline__ long atomic64_read(const atomic64_t *v) +static __inline__ s64 atomic64_read(const atomic64_t *v) { - long t; + s64 t; __asm__ __volatile__("ld%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter)); return t; } -static __inline__ void atomic64_set(atomic64_t *v, long i) +static __inline__ void atomic64_set(atomic64_t *v, s64 i) { __asm__ __volatile__("std%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i)); } #define ATOMIC64_OP(op, asm_op) \ -static __inline__ void atomic64_##op(long a, atomic64_t *v) \ +static __inline__ void atomic64_##op(s64 a, atomic64_t *v) \ { \ - long t; \ + s64 t; \ \ __asm__ __volatile__( \ "1: ldarx %0,0,%3 # atomic64_" #op "\n" \ @@ -327,10 +327,10 @@ static __inline__ void atomic64_##op(long a, atomic64_t *v) \ } #define ATOMIC64_OP_RETURN_RELAXED(op, asm_op) \ -static inline long \ -atomic64_##op##_return_relaxed(long a, atomic64_t *v) \ +static inline s64 \ +atomic64_##op##_return_relaxed(s64 a, atomic64_t *v) \ { \ - long t; \ + s64 t; \ \ __asm__ __volatile__( \ "1: ldarx %0,0,%3 # atomic64_" #op "_return_relaxed\n" \ @@ -345,10 +345,10 @@ atomic64_##op##_return_relaxed(long a, atomic64_t *v) \ } #define ATOMIC64_FETCH_OP_RELAXED(op, asm_op) \ -static inline long \ -atomic64_fetch_##op##_relaxed(long a, atomic64_t *v) \ +static inline s64 \ +atomic64_fetch_##op##_relaxed(s64 a, atomic64_t *v) \ { \ - long res, t; \ + s64 res, t; \ \ __asm__ __volatile__( \ "1: ldarx %0,0,%4 # atomic64_fetch_" #op "_relaxed\n" \ @@ -396,7 +396,7 @@ ATOMIC64_OPS(xor, xor) static __inline__ void atomic64_inc(atomic64_t *v) { - long t; + s64 t; __asm__ __volatile__( "1: ldarx %0,0,%2 # atomic64_inc\n\ @@ -409,9 +409,9 @@ static __inline__ void atomic64_inc(atomic64_t *v) } #define atomic64_inc atomic64_inc -static __inline__ long atomic64_inc_return_relaxed(atomic64_t *v) +static __inline__ s64 atomic64_inc_return_relaxed(atomic64_t *v) { - long t; + s64 t; __asm__ __volatile__( "1: ldarx %0,0,%2 # atomic64_inc_return_relaxed\n" @@ -427,7 +427,7 @@ static __inline__ long atomic64_inc_return_relaxed(atomic64_t *v) static __inline__ void atomic64_dec(atomic64_t *v) { - long t; + s64 t; __asm__ __volatile__( "1: ldarx %0,0,%2 # atomic64_dec\n\ @@ -440,9 +440,9 @@ static __inline__ void atomic64_dec(atomic64_t *v) } #define atomic64_dec atomic64_dec -static __inline__ long atomic64_dec_return_relaxed(atomic64_t *v) +static __inline__ s64 atomic64_dec_return_relaxed(atomic64_t *v) { - long t; + s64 t; __asm__ __volatile__( "1: ldarx %0,0,%2 # atomic64_dec_return_relaxed\n" @@ -463,9 +463,9 @@ static __inline__ long atomic64_dec_return_relaxed(atomic64_t *v) * Atomically test *v and decrement if it is greater than 0. 
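The powerpc atomic.h hunks here (and the riscv ones later in this section) make the same mechanical change: the atomic64_* operations trade "long" for "s64". "long" merely happens to be 64-bit on these ports; the generic atomic64 API is specified in terms of s64, so spelling the width out keeps the arch prototypes in sync with the asm-generic fallbacks and instrumentation wrappers. A minimal usage sketch against the post-change API:

#include <linux/atomic.h>

static atomic64_t bytes_seen = ATOMIC64_INIT(0);

static s64 account(s64 n)
{
        atomic64_add(n, &bytes_seen);
        return atomic64_read(&bytes_seen);      /* now returns s64, not long */
}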
* The function returns the old value of *v minus 1. */ -static __inline__ long atomic64_dec_if_positive(atomic64_t *v) +static __inline__ s64 atomic64_dec_if_positive(atomic64_t *v) { - long t; + s64 t; __asm__ __volatile__( PPC_ATOMIC_ENTRY_BARRIER @@ -502,9 +502,9 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ long atomic64_fetch_add_unless(atomic64_t *v, long a, long u) +static __inline__ s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { - long t; + s64 t; __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER @@ -534,7 +534,7 @@ static __inline__ long atomic64_fetch_add_unless(atomic64_t *v, long a, long u) */ static __inline__ int atomic64_inc_not_zero(atomic64_t *v) { - long t1, t2; + s64 t1, t2; __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 3f53be60fb01..64145751b2fd 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -140,6 +140,20 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p) } #endif +#ifdef CONFIG_PPC64 +#define is_ioremap_addr is_ioremap_addr +static inline bool is_ioremap_addr(const void *x) +{ +#ifdef CONFIG_MMU + unsigned long addr = (unsigned long)x; + + return addr >= IOREMAP_BASE && addr < IOREMAP_END; +#else + return false; +#endif +} +#endif /* CONFIG_PPC64 */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index ef573fe9873e..a9993e7a443b 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -346,8 +346,6 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #define spin_cpu_relax() barrier() -#define spin_cpu_yield() spin_cpu_relax() - #define spin_end() HMT_medium() #define spin_until_cond(cond) \ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f0fbbf6a6a1f..b448b0938299 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -639,7 +639,7 @@ void do_break (struct pt_regs *regs, unsigned long address, hw_breakpoint_disable(); /* Deliver the signal to userspace */ - force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address, current); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address); } #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 684b0b315c32..8c92febf5f44 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -2521,7 +2521,6 @@ void ptrace_disable(struct task_struct *child) { /* make sure the single step bit is not set. */ user_disable_single_step(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } #ifdef CONFIG_PPC_ADV_DEBUG_REGS diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index b824f4c69622..0ab4c72515c4 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -990,8 +990,7 @@ int rtas_ibm_suspend_me(u64 handle) /* Call function on all CPUs. 
One of us will make the * rtas call */ - if (on_each_cpu(rtas_percpu_suspend_me, &data, 0)) - atomic_set(&data.error, -EINVAL); + on_each_cpu(rtas_percpu_suspend_me, &data, 0); wait_for_completion(&done); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index a2b74e057904..f50b708d6d77 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1245,7 +1245,7 @@ SYSCALL_DEFINE0(rt_sigreturn) current->comm, current->pid, rt_sf, regs->nip, regs->link); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -1334,7 +1334,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx, current->comm, current->pid, ctx, regs->nip, regs->link); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); goto out; } @@ -1512,6 +1512,6 @@ badframe: current->comm, current->pid, addr, regs->nip, regs->link); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 4292ea39baa4..2f80e270c7b0 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -808,7 +808,7 @@ badframe: current->comm, current->pid, "rt_sigreturn", (long)uc, regs->nip, regs->link); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c index c612d50c9d18..b84992c10854 100644 --- a/arch/powerpc/kernel/suspend.c +++ b/arch/powerpc/kernel/suspend.c @@ -7,6 +7,7 @@ */ #include <linux/mm.h> +#include <linux/suspend.h> #include <asm/page.h> #include <asm/sections.h> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 103655d84b4b..f2c3bda2d39f 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -515,3 +515,4 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 47df30982de1..11caa0291254 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -297,7 +297,7 @@ NOKPROBE_SYMBOL(die); void user_single_step_report(struct pt_regs *regs) { - force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)regs->nip, current); + force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)regs->nip); } static void show_signal_msg(int signr, struct pt_regs *regs, int code, @@ -363,7 +363,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) if (!exception_common(signr, regs, code, addr)) return; - force_sig_fault(signr, code, (void __user *)addr, current); + force_sig_fault(signr, code, (void __user *)addr); } /* diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index e8276161872e..381bf8dea193 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -827,7 +827,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) * * Note: If EOI is incorrectly used by SW to lower the CPPR * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies + * a pending interrupt, this is a SW error and PAPR specifies * that we don't have to deal with it. 
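The force_sig()/force_sig_fault() conversions threaded through this section are all instances of one API change: practically every caller passed current, so the task argument was dropped and the helpers now target current implicitly; the rare caller that genuinely signals another task (the parisc ptrace hunk earlier) uses the explicit force_sig_fault_to_task() variant. A hedged sketch of both forms against the new signatures:

#include <linux/sched/signal.h>

static void report_bad_access(void __user *addr)
{
        /* before: force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, current); */
        force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
}

static void report_trace_trap(struct task_struct *tsk, void __user *addr)
{
        /* explicit-task variant for the uncommon case */
        force_sig_fault_to_task(SIGTRAP, TRAP_TRACE, addr, tsk);
}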
* * The sending of an EOI to the ICS is handled after the diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6d704ad2472b..0dba7eb24f92 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -414,9 +414,9 @@ int kvm_arch_hardware_setup(void) return 0; } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - *(int *)rtn = kvmppc_core_check_processor_compat(); + return kvmppc_core_check_processor_compat(); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index bb9835681315..ce8a77fae6a7 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -666,6 +666,11 @@ EXPORT_SYMBOL(radix__flush_tlb_page); #define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ +/* + * If kernel TLBIs ever become local rather than global, then + * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it + * assumes kernel TLBIs are global. + */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { _tlbie_pid(0, RIC_FLUSH_ALL); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ec6b7ad70659..d989592b6fc8 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -178,13 +178,12 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; - force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, - current); + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return 0; } #endif - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b5d92dc32844..51716c11d0fb 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -511,13 +511,6 @@ retry: return page; } -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? 
__boundary : end; -} - #ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -665,68 +658,3 @@ void flush_dcache_icache_hugepage(struct page *page) } } } - -static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long pte_end; - struct page *head, *page; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = READ_ONCE(*ptep); - - if (!pte_access_permitted(pte, write)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, - unsigned long end, int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index c2ee6041f02c..02a59946a78a 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -500,6 +500,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, case BPF_ALU | BPF_LSH | BPF_X: /* (u32) dst <<= (u32) src */ /* slw clears top 32 bits */ PPC_SLW(dst_reg, dst_reg, src_reg); + /* skip zero extension move, but set address map. 
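The insn_is_zext()/verifier_zext checks sprinkled through the ppc64 JIT below (and the riscv JIT later in this section) implement one scheme: a 64-bit JIT whose 32-bit ALU ops or loads already clear the upper 32 bits can report bpf_jit_needs_zext(), after which the verifier inserts an explicit zero-extension pseudo-instruction only where a 32-bit result is truly consumed as 64 bits, and the JIT consumes that pseudo-insn instead of zero-extending every operation. A simplified sketch (struct jit_ctx and emit_op32() are hypothetical stand-ins for the real per-arch emitters):

#include <linux/filter.h>

struct jit_ctx;                                 /* hypothetical JIT state */
static void emit_op32(const struct bpf_insn *insn, struct jit_ctx *ctx);

bool bpf_jit_needs_zext(void)
{
        return true;    /* opt in: the verifier now emits explicit zexts */
}

static int jit_alu32(const struct bpf_insn *insn, struct jit_ctx *ctx)
{
        emit_op32(insn, ctx);           /* upper 32 bits already zeroed */

        /* the explicit zext that may follow is then redundant: skip it */
        if (insn_is_zext(&insn[1]))
                return 1;               /* consumed one extra insn */
        return 0;
}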
*/ + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; case BPF_ALU64 | BPF_LSH | BPF_X: /* dst <<= src; */ PPC_SLD(dst_reg, dst_reg, src_reg); @@ -507,6 +510,8 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, case BPF_ALU | BPF_LSH | BPF_K: /* (u32) dst <<== (u32) imm */ /* with imm 0, we still need to clear top 32 bits */ PPC_SLWI(dst_reg, dst_reg, imm); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst <<== imm */ if (imm != 0) @@ -514,12 +519,16 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, break; case BPF_ALU | BPF_RSH | BPF_X: /* (u32) dst >>= (u32) src */ PPC_SRW(dst_reg, dst_reg, src_reg); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; case BPF_ALU64 | BPF_RSH | BPF_X: /* dst >>= src */ PPC_SRD(dst_reg, dst_reg, src_reg); break; case BPF_ALU | BPF_RSH | BPF_K: /* (u32) dst >>= (u32) imm */ PPC_SRWI(dst_reg, dst_reg, imm); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst >>= imm */ if (imm != 0) @@ -544,6 +553,11 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, */ case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */ case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ + if (imm == 1) { + /* special mov32 for zext */ + PPC_RLWINM(dst_reg, dst_reg, 0, 0, 31); + break; + } PPC_MR(dst_reg, src_reg); goto bpf_alu32_trunc; case BPF_ALU | BPF_MOV | BPF_K: /* (u32) dst = imm */ @@ -551,11 +565,13 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, PPC_LI32(dst_reg, imm); if (imm < 0) goto bpf_alu32_trunc; + else if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; bpf_alu32_trunc: /* Truncate to 32-bits */ - if (BPF_CLASS(code) == BPF_ALU) + if (BPF_CLASS(code) == BPF_ALU && !fp->aux->verifier_zext) PPC_RLWINM(dst_reg, dst_reg, 0, 0, 31); break; @@ -614,10 +630,13 @@ emit_clear: case 16: /* zero-extend 16 bits into 64 bits */ PPC_RLDICL(dst_reg, dst_reg, 0, 48); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; case 32: - /* zero-extend 32 bits into 64 bits */ - PPC_RLDICL(dst_reg, dst_reg, 0, 32); + if (!fp->aux->verifier_zext) + /* zero-extend 32 bits into 64 bits */ + PPC_RLDICL(dst_reg, dst_reg, 0, 32); break; case 64: /* nop */ @@ -694,14 +713,20 @@ emit_clear: /* dst = *(u8 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_B: PPC_LBZ(dst_reg, src_reg, off); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; /* dst = *(u16 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_H: PPC_LHZ(dst_reg, src_reg, off); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; /* dst = *(u32 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_W: PPC_LWZ(dst_reg, src_reg, off); + if (insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_DW: @@ -1042,6 +1067,11 @@ struct powerpc64_jit_data { struct codegen_context ctx; }; +bool bpf_jit_needs_zext(void) +{ + return true; +} + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { u32 proglen; diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index 6dfd2cb1bce7..24adbe3c605c 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -31,22 +31,21 @@ static void spufs_handle_event(struct spu_context *ctx, switch (type) { case SPE_EVENT_INVALID_DMA: - force_sig_fault(SIGBUS, BUS_OBJERR, 
NULL, current); + force_sig_fault(SIGBUS, BUS_OBJERR, NULL); break; case SPE_EVENT_SPE_DATA_STORAGE: ctx->ops->restart_dma(ctx); - force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea, - current); + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea); break; case SPE_EVENT_DMA_ALIGNMENT: /* DAR isn't set for an alignment fault :( */ - force_sig_fault(SIGBUS, BUS_ADRALN, NULL, current); + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); break; case SPE_EVENT_SPE_ERROR: force_sig_fault( SIGILL, ILL_ILLOPC, (void __user *)(unsigned long) - ctx->ops->npc_read(ctx) - 4, current); + ctx->ops->npc_read(ctx) - 4); break; } } diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c index 07f82d7395ff..3f2380f40f99 100644 --- a/arch/powerpc/platforms/cell/spufs/run.c +++ b/arch/powerpc/platforms/cell/spufs/run.c @@ -443,7 +443,7 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event) else if (unlikely((status & SPU_STATUS_STOPPED_BY_STOP) && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff)) { - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); ret = -ERESTARTSYS; } diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index e56b553de27b..f18d5067cd0f 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx) * runqueue. The context will be rescheduled on the proper node * if it is timesliced or preempted. */ - cpumask_copy(&ctx->cpus_allowed, ¤t->cpus_allowed); + cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr); /* Save the current cpu id for spu interrupt routing. */ ctx->last_ran = raw_smp_processor_id(); diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index 84e8ec4011ba..b91eb0929ed1 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -147,13 +147,13 @@ static const struct dma_map_ops ibmebus_dma_ops = { .unmap_page = ibmebus_unmap_page, }; -static int ibmebus_match_path(struct device *dev, void *data) +static int ibmebus_match_path(struct device *dev, const void *data) { struct device_node *dn = to_platform_device(dev)->dev.of_node; return (of_find_node_by_path(data) == dn); } -static int ibmebus_match_node(struct device *dev, void *data) +static int ibmebus_match_node(struct device *dev, const void *data) { return to_platform_device(dev)->dev.of_node == data; } diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig index e0dbec780fe9..d23288c4abf6 100644 --- a/arch/powerpc/sysdev/Kconfig +++ b/arch/powerpc/sysdev/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. # config PPC4xx_PCI_EXPRESS diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 0c4b12205632..13a1c0d04e9e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only # # For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. +# see Documentation/kbuild/kconfig-language.rst. 
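The ibmebus_match_path()/ibmebus_match_node() constification above tracks a driver-core change from the same merge window: the opaque cookie handed to bus_find_device() match callbacks became const void *, since matchers only inspect it. A small example against what I take to be the post-change signature:

#include <linux/device.h>
#include <linux/of.h>

static int match_by_of_node(struct device *dev, const void *data)
{
        return dev->of_node == data;
}

static struct device *find_by_of_node(struct bus_type *bus,
                                      struct device_node *np)
{
        return bus_find_device(bus, NULL, np, match_by_of_node);
}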
# config 64BIT @@ -17,6 +17,7 @@ config RISCV select OF select OF_EARLY_FLATTREE select OF_IRQ + select ARCH_HAS_BINFMT_FLAT select ARCH_WANT_FRAME_POINTERS select CLONE_BACKWARDS select COMMON_CLK @@ -50,6 +51,7 @@ config RISCV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MMIOWB select HAVE_EBPF_JIT if 64BIT + select EDAC_SUPPORT config MMU def_bool y diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 5ee646619cc3..1efaeddf1e4b 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += compat.h generic-y += device.h generic-y += div64.h generic-y += extable.h +generic-y += flat.h generic-y += dma.h generic-y += dma-contiguous.h generic-y += dma-mapping.h diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 9038aeb900a6..96f95c9ebd97 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -38,11 +38,11 @@ static __always_inline void atomic_set(atomic_t *v, int i) #ifndef CONFIG_GENERIC_ATOMIC64 #define ATOMIC64_INIT(i) { (i) } -static __always_inline long atomic64_read(const atomic64_t *v) +static __always_inline s64 atomic64_read(const atomic64_t *v) { return READ_ONCE(v->counter); } -static __always_inline void atomic64_set(atomic64_t *v, long i) +static __always_inline void atomic64_set(atomic64_t *v, s64 i) { WRITE_ONCE(v->counter, i); } @@ -66,11 +66,11 @@ void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v) \ #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS(op, asm_op, I) \ - ATOMIC_OP (op, asm_op, I, w, int, ) + ATOMIC_OP (op, asm_op, I, w, int, ) #else #define ATOMIC_OPS(op, asm_op, I) \ - ATOMIC_OP (op, asm_op, I, w, int, ) \ - ATOMIC_OP (op, asm_op, I, d, long, 64) + ATOMIC_OP (op, asm_op, I, w, int, ) \ + ATOMIC_OP (op, asm_op, I, d, s64, 64) #endif ATOMIC_OPS(add, add, i) @@ -127,14 +127,14 @@ c_type atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS(op, asm_op, c_op, I) \ - ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ - ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) + ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ + ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) #else #define ATOMIC_OPS(op, asm_op, c_op, I) \ - ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ - ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) \ - ATOMIC_FETCH_OP( op, asm_op, I, d, long, 64) \ - ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, long, 64) + ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ + ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) \ + ATOMIC_FETCH_OP( op, asm_op, I, d, s64, 64) \ + ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, s64, 64) #endif ATOMIC_OPS(add, add, +, i) @@ -166,11 +166,11 @@ ATOMIC_OPS(sub, add, +, -i) #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS(op, asm_op, I) \ - ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) + ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) #else #define ATOMIC_OPS(op, asm_op, I) \ - ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) \ - ATOMIC_FETCH_OP(op, asm_op, I, d, long, 64) + ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) \ + ATOMIC_FETCH_OP(op, asm_op, I, d, s64, 64) #endif ATOMIC_OPS(and, and, i) @@ -219,9 +219,10 @@ static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) #define atomic_fetch_add_unless atomic_fetch_add_unless #ifndef CONFIG_GENERIC_ATOMIC64 -static __always_inline long atomic64_fetch_add_unless(atomic64_t *v, long a, long u) +static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { - long prev, rc; + s64 prev; + long 
rc; __asm__ __volatile__ ( "0: lr.d %[p], %[c]\n" @@ -290,11 +291,11 @@ c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n) \ #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS() \ - ATOMIC_OP( int, , 4) + ATOMIC_OP(int, , 4) #else #define ATOMIC_OPS() \ - ATOMIC_OP( int, , 4) \ - ATOMIC_OP(long, 64, 8) + ATOMIC_OP(int, , 4) \ + ATOMIC_OP(s64, 64, 8) #endif ATOMIC_OPS() @@ -332,9 +333,10 @@ static __always_inline int atomic_sub_if_positive(atomic_t *v, int offset) #define atomic_dec_if_positive(v) atomic_sub_if_positive(v, 1) #ifndef CONFIG_GENERIC_ATOMIC64 -static __always_inline long atomic64_sub_if_positive(atomic64_t *v, int offset) +static __always_inline s64 atomic64_sub_if_positive(atomic64_t *v, s64 offset) { - long prev, rc; + s64 prev; + long rc; __asm__ __volatile__ ( "0: lr.d %[p], %[c]\n" diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index f653bfc8a83b..07ceee8b1747 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -86,7 +86,7 @@ struct task_struct; extern void die(struct pt_regs *regs, const char *str); extern void do_trap(struct pt_regs *regs, int signo, int code, - unsigned long addr, struct task_struct *tsk); + unsigned long addr); #endif /* !__ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index eb8b0195f27f..56a67d66f72f 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -10,6 +10,8 @@ #include <linux/mm.h> #include <asm/tlb.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { @@ -74,33 +76,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #endif /* __PAGETABLE_PMD_FOLDED */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page( - GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO); -} - -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO); - if (likely(pte != NULL)) - pgtable_page_ctor(pte); - return pte; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - #define __pte_free_tlb(tlb, pte, buf) \ do { \ pgtable_page_dtor(pte); \ diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 1fe1b02e44d0..b14d7647d800 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -126,7 +126,7 @@ badframe: task->comm, task_pid_nr(task), __func__, frame, (void *)regs->sepc, (void *)regs->sp); } - force_sig(SIGSEGV, task); + force_sig(SIGSEGV); return 0; } diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 6b32190ba73c..424eb72d56b1 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -55,9 +55,10 @@ void die(struct pt_regs *regs, const char *str) do_exit(SIGSEGV); } -void do_trap(struct pt_regs *regs, int signo, int code, - unsigned long addr, struct task_struct *tsk) +void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) { + struct task_struct *tsk = current; + if (show_unhandled_signals && unhandled_signal(tsk, signo) && printk_ratelimit()) { pr_info("%s[%d]: unhandled signal %d code 0x%x at 0x" REG_FMT, @@ -67,14 +68,14 @@ void do_trap(struct pt_regs *regs, int signo, int code, 
show_regs(regs); } - force_sig_fault(signo, code, (void __user *)addr, tsk); + force_sig_fault(signo, code, (void __user *)addr); } static void do_trap_error(struct pt_regs *regs, int signo, int code, unsigned long addr, const char *str) { if (user_mode(regs)) { - do_trap(regs, signo, code, addr, current); + do_trap(regs, signo, code, addr); } else { if (!fixup_exception(regs)) die(regs, str); @@ -140,7 +141,7 @@ asmlinkage void do_trap_break(struct pt_regs *regs) } #endif /* CONFIG_GENERIC_BUG */ - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)(regs->sepc), current); + force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)(regs->sepc)); } #ifdef CONFIG_GENERIC_BUG diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index f960c3f4ce47..96add1427a75 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -169,7 +169,7 @@ bad_area: up_read(&mm->mmap_sem); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { - do_trap(regs, SIGSEGV, code, addr, tsk); + do_trap(regs, SIGSEGV, code, addr); return; } @@ -205,7 +205,7 @@ do_sigbus: /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; - do_trap(regs, SIGBUS, BUS_ADRERR, addr, tsk); + do_trap(regs, SIGBUS, BUS_ADRERR, addr); return; vmalloc_fault: @@ -219,7 +219,7 @@ vmalloc_fault: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) - return do_trap(regs, SIGSEGV, code, addr, tsk); + return do_trap(regs, SIGSEGV, code, addr); /* * Synchronize this task's top level page-table diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c index 426d5c33ea90..5451ef3845f2 100644 --- a/arch/riscv/net/bpf_jit_comp.c +++ b/arch/riscv/net/bpf_jit_comp.c @@ -731,6 +731,7 @@ static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64 || BPF_CLASS(insn->code) == BPF_JMP; + struct bpf_prog_aux *aux = ctx->prog->aux; int rvoff, i = insn - ctx->prog->insnsi; u8 rd = -1, rs = -1, code = insn->code; s16 off = insn->off; @@ -742,8 +743,13 @@ static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (imm == 1) { + /* Special mov32 for zext */ + emit_zext_32(rd, ctx); + break; + } emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; @@ -751,49 +757,49 @@ static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU64 | BPF_ADD | BPF_X: emit(is64 ? rv_add(rd, rd, rs) : rv_addw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU64 | BPF_SUB | BPF_X: emit(is64 ? 
rv_sub(rd, rd, rs) : rv_subw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU64 | BPF_AND | BPF_X: emit(rv_and(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU64 | BPF_OR | BPF_X: emit(rv_or(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU64 | BPF_XOR | BPF_X: emit(rv_xor(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_X: emit(is64 ? rv_mul(rd, rd, rs) : rv_mulw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU64 | BPF_DIV | BPF_X: emit(is64 ? rv_divu(rd, rd, rs) : rv_divuw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_X: emit(is64 ? rv_remu(rd, rd, rs) : rv_remuw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_LSH | BPF_X: @@ -805,13 +811,13 @@ static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_RSH | BPF_X: emit(is64 ? rv_srl(rd, rd, rs) : rv_srlw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_ARSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: emit(is64 ? rv_sra(rd, rd, rs) : rv_sraw(rd, rd, rs), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; @@ -820,7 +826,7 @@ static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_ALU64 | BPF_NEG: emit(is64 ? rv_sub(rd, RV_REG_ZERO, rd) : rv_subw(rd, RV_REG_ZERO, rd), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; @@ -885,7 +891,7 @@ out_be: case BPF_ALU | BPF_MOV | BPF_K: case BPF_ALU64 | BPF_MOV | BPF_K: emit_imm(rd, imm, ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; @@ -900,7 +906,7 @@ out_be: emit(is64 ? rv_add(rd, rd, RV_REG_T1) : rv_addw(rd, rd, RV_REG_T1), ctx); } - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_SUB | BPF_K: @@ -913,7 +919,7 @@ out_be: emit(is64 ? rv_sub(rd, rd, RV_REG_T1) : rv_subw(rd, rd, RV_REG_T1), ctx); } - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_AND | BPF_K: @@ -924,7 +930,7 @@ out_be: emit_imm(RV_REG_T1, imm, ctx); emit(rv_and(rd, rd, RV_REG_T1), ctx); } - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_OR | BPF_K: @@ -935,7 +941,7 @@ out_be: emit_imm(RV_REG_T1, imm, ctx); emit(rv_or(rd, rd, RV_REG_T1), ctx); } - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_XOR | BPF_K: @@ -946,7 +952,7 @@ out_be: emit_imm(RV_REG_T1, imm, ctx); emit(rv_xor(rd, rd, RV_REG_T1), ctx); } - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_MUL | BPF_K: @@ -954,7 +960,7 @@ out_be: emit_imm(RV_REG_T1, imm, ctx); emit(is64 ? 
rv_mul(rd, rd, RV_REG_T1) : rv_mulw(rd, rd, RV_REG_T1), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_DIV | BPF_K: @@ -962,7 +968,7 @@ out_be: emit_imm(RV_REG_T1, imm, ctx); emit(is64 ? rv_divu(rd, rd, RV_REG_T1) : rv_divuw(rd, rd, RV_REG_T1), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_MOD | BPF_K: @@ -970,7 +976,7 @@ out_be: emit_imm(RV_REG_T1, imm, ctx); emit(is64 ? rv_remu(rd, rd, RV_REG_T1) : rv_remuw(rd, rd, RV_REG_T1), ctx); - if (!is64) + if (!is64 && !aux->verifier_zext) emit_zext_32(rd, ctx); break; case BPF_ALU | BPF_LSH | BPF_K: @@ -1263,6 +1269,8 @@ out_be: emit_imm(RV_REG_T1, off, ctx); emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx); emit(rv_lbu(rd, 0, RV_REG_T1), ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_LDX | BPF_MEM | BPF_H: if (is_12b_int(off)) { @@ -1273,6 +1281,8 @@ out_be: emit_imm(RV_REG_T1, off, ctx); emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx); emit(rv_lhu(rd, 0, RV_REG_T1), ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_LDX | BPF_MEM | BPF_W: if (is_12b_int(off)) { @@ -1283,6 +1293,8 @@ out_be: emit_imm(RV_REG_T1, off, ctx); emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx); emit(rv_lwu(rd, 0, RV_REG_T1), ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_LDX | BPF_MEM | BPF_DW: if (is_12b_int(off)) { @@ -1527,6 +1539,11 @@ static void bpf_flush_icache(void *start, void *end) flush_icache_range((unsigned long)start, (unsigned long)end); } +bool bpf_jit_needs_zext(void) +{ + return true; +} + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { bool tmp_blinded = false, extra_pass = false; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 109243fdb6ec..5d8570ed6cab 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +config ARCH_HAS_MEM_ENCRYPT + def_bool y + config MMU def_bool y @@ -30,7 +33,7 @@ config GENERIC_BUG_RELATIVE_POINTERS def_bool y config GENERIC_LOCKBREAK - def_bool y if SMP && PREEMPT + def_bool y if PREEMPT config PGSTE def_bool y if KVM @@ -113,7 +116,6 @@ config S390 select DYNAMIC_FTRACE if FUNCTION_TRACER select GENERIC_CLOCKEVENTS select GENERIC_CPU_AUTOPROBE - select GENERIC_CPU_DEVICES if !SMP select GENERIC_CPU_VULNERABILITIES select GENERIC_FIND_FIRST_BIT select GENERIC_SMP_IDLE_THREAD @@ -137,6 +139,7 @@ config S390 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_FAST_GUP select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD @@ -144,7 +147,6 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS - select HAVE_GENERIC_GUP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -187,6 +189,8 @@ config S390 select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME select HAVE_NMI + select SWIOTLB + select GENERIC_ALLOCATOR config SCHED_OMIT_FRAME_POINTER @@ -399,27 +403,10 @@ config SYSVIPC_COMPAT config SMP def_bool y - prompt "Symmetric multi-processing support" - ---help--- - This enables support for systems with more than one CPU. If you have - a system with only one CPU, like most personal computers, say N. If - you have a system with more than one CPU, say Y. - - If you say N here, the kernel will run on uni- and multiprocessor - machines, but will use only one CPU of a multiprocessor machine. 
If - you say Y here, the kernel will run on many, but not all, - uniprocessor machines. On a uniprocessor machine, the kernel - will run faster if you say N here. - - See also the SMP-HOWTO available at - <http://www.tldp.org/docs.html#howto>. - - Even if you don't know what to do here, say Y. config NR_CPUS int "Maximum number of CPUs (2-512)" range 2 512 - depends on SMP default "64" help This allows you to specify the maximum number of CPUs which this @@ -431,12 +418,6 @@ config NR_CPUS config HOTPLUG_CPU def_bool y - prompt "Support for hot-pluggable CPUs" - depends on SMP - help - Say Y here to be able to turn CPUs off and on. CPUs - can be controlled through /sys/devices/system/cpu/cpu#. - Say N if you want to disable CPU hotplug. # Some NUMA nodes have memory ranges that span # other nodes. Even though a pfn is valid and @@ -448,7 +429,7 @@ config NODES_SPAN_OTHER_NODES config NUMA bool "NUMA support" - depends on SMP && SCHED_TOPOLOGY + depends on SCHED_TOPOLOGY default n help Enable NUMA support @@ -523,7 +504,6 @@ config SCHED_DRAWER config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" - depends on SMP select SCHED_SMT select SCHED_MC select SCHED_BOOK @@ -661,9 +641,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y -config ARCH_SELECT_MEMORY_MODEL - def_bool y - config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y if SPARSEMEM @@ -763,7 +740,7 @@ config PCI_NR_FUNCTIONS This allows you to specify the maximum number of PCI functions which this kernel will support. -endif # PCI +endif # PCI config HAS_IOMEM def_bool PCI @@ -829,16 +806,15 @@ menu "Dump support" config CRASH_DUMP bool "kernel crash dumps" - depends on SMP select KEXEC help Generate crash dump after being started by kexec. Crash dump kernels are loaded in the main kernel with kexec-tools into a specially reserved region and then later executed after a crash by kdump/kexec. - Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this. + Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this. This option also enables s390 zfcpdump. 
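The s390 hunks above drop uniprocessor support outright: SMP becomes def_bool y with no prompt, and HOTPLUG_CPU, NR_CPUS, NUMA and friends lose their SMP dependencies. One practical consequence, sketched generically below, is that IS_ENABLED(CONFIG_SMP) tests in code built for s390 turn into compile-time constants and the uniprocessor branches fold away:

#include <linux/kconfig.h>

static int pick_path(void)
{
        if (IS_ENABLED(CONFIG_SMP))
                return 1;       /* always taken on s390 after this change */
        return 0;               /* dead code, dropped by the compiler */
}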
- See also <file:Documentation/s390/zfcpdump.txt> + See also <file:Documentation/s390/zfcpdump.rst> endmenu diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index b0920b35f87b..a6dc01a22048 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -88,6 +88,7 @@ CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_CHSC_SCH=y CONFIG_VFIO_AP=m +CONFIG_VFIO_CCW=m CONFIG_CRASH_DUMP=y CONFIG_BINFMT_MISC=m CONFIG_HIBERNATION=y @@ -498,6 +499,7 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_S390_AP_IOMMU=y +CONFIG_S390_CCW_IOMMU=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c59b922cb6c5..e4bc40073003 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -1,21 +1,22 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_USELIB=y CONFIG_AUDIT=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y -# CONFIG_CPU_ISOLATION is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y -CONFIG_CGROUPS=y +CONFIG_NUMA_BALANCING=y +# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_FREEZER=y @@ -26,98 +27,402 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y -CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set +CONFIG_CHECKPOINT_RESTORE=y CONFIG_BPF_SYSCALL=y CONFIG_USERFAULTFD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_LIVEPATCH=y -CONFIG_NR_CPUS=256 -CONFIG_NUMA=y -CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y -CONFIG_CRASH_DUMP=y -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y -CONFIG_CMM=m -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y -CONFIG_STATIC_KEYS_SELFTEST=y CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_SHA256=y CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_WBT=y +CONFIG_BLK_WBT_SQ=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_CFQ_GROUP_IOSCHED=y CONFIG_DEFAULT_DEADLINE=y -CONFIG_BINFMT_MISC=m +CONFIG_LIVEPATCH=y +CONFIG_TUNE_ZEC12=y +CONFIG_NR_CPUS=512 +CONFIG_NUMA=y +CONFIG_HZ_100=y +CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y +CONFIG_EXPOLINE=y +CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y +CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y CONFIG_ZBUD=m CONFIG_ZSMALLOC=m CONFIG_ZSMALLOC_STAT=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_PCI=y +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_S390=y +CONFIG_CHSC_SCH=y +CONFIG_VFIO_AP=m +CONFIG_VFIO_CCW=m +CONFIG_CRASH_DUMP=y +CONFIG_BINFMT_MISC=m +CONFIG_HIBERNATION=y +CONFIG_PM_DEBUG=y CONFIG_NET=y CONFIG_PACKET=y +CONFIG_PACKET_DIAG=m CONFIG_UNIX=y -CONFIG_NET_KEY=y +CONFIG_UNIX_DIAG=m +CONFIG_XFRM_USER=m +CONFIG_NET_KEY=m +CONFIG_SMC=m +CONFIG_SMC_DIAG=m 
CONFIG_INET=y CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_INET_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_TABLES=m +CONFIG_NFT_CT=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_NAT=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NETFILTER_XT_SET=m +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m 
+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_PE_SIP=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_CHAIN_ROUTE_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +CONFIG_NF_CONNTRACK_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_CHAIN_ROUTE_IPV6=m +CONFIG_NFT_CHAIN_NAT_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_NF_TABLES_BRIDGE=y +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m CONFIG_L2TP=m CONFIG_L2TP_DEBUGFS=m -CONFIG_VLAN_8021Q=y +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_BRIDGE=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y CONFIG_NET_SCHED=y CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m CONFIG_NET_SCH_SFQ=m CONFIG_NET_SCH_TEQL=m CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_CLS_BASIC=m CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=y +CONFIG_NET_CLS_BPF=m CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y 
+CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_DNS_RESOLVER=y +CONFIG_OPENVSWITCH=m +CONFIG_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_NET_PKTGEN=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 +CONFIG_CONNECTOR=y +CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=32768 CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_NVME=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_GENWQE=m +CONFIG_RAID_ATTRS=m CONFIG_SCSI=y -# CONFIG_SCSI_MQ_DEFAULT is not set CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=y -CONFIG_BLK_DEV_SR=y -CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m CONFIG_CHR_DEV_SG=y +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_ISCSI_TCP=m +CONFIG_SCSI_DEBUG=m CONFIG_ZFCP=y -CONFIG_SCSI_VIRTIO=y +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_SCSI_OSD_INITIATOR=m +CONFIG_SCSI_OSD_ULD=m CONFIG_MD=y +CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m CONFIG_MD_MULTIPATH=m -CONFIG_BLK_DEV_DM=y +CONFIG_MD_FAULTY=m +CONFIG_BLK_DEV_DM=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -125,71 +430,216 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m CONFIG_DM_SWITCH=m CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=m CONFIG_EQUALIZER=m +CONFIG_IFB=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_VXLAN=m CONFIG_TUN=m -CONFIG_VIRTIO_NET=y -# CONFIG_NET_VENDOR_ALACRITECH is not set -# CONFIG_NET_VENDOR_AURORA is not set -# CONFIG_NET_VENDOR_CORTINA is not set -# CONFIG_NET_VENDOR_SOLARFLARE is not set -# CONFIG_NET_VENDOR_SOCIONEXT is not set -# CONFIG_NET_VENDOR_SYNOPSYS is not set -# CONFIG_INPUT is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +# CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_VENDOR_CHELSIO is not set +# CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_MARVELL is not set +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +# CONFIG_NET_VENDOR_NATSEMI is not set +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_MPPE=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_ISM=m +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -# CONFIG_VT is not set -CONFIG_DEVKMEM=y +CONFIG_LEGACY_PTY_COUNT=0 +CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m -CONFIG_VIRTIO_BALLOON=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TN3270_FS=y +# CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_NOWAYOUT=y +CONFIG_SOFT_WATCHDOG=m +CONFIG_DIAG288_WATCHDOG=m +CONFIG_DRM=y +CONFIG_DRM_VIRTIO_GPU=y +CONFIG_FRAMEBUFFER_CONSOLE=y +# CONFIG_HID is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_VFIO=m +CONFIG_VFIO_PCI=m 
+CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=y +CONFIG_S390_AP_IOMMU=y +CONFIG_S390_CCW_IOMMU=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y +CONFIG_JBD2_DEBUG=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +CONFIG_JFS_STATISTICS=y CONFIG_XFS_FS=y CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y CONFIG_XFS_RT=y +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_NILFS2_FS=m +CONFIG_FS_DAX=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FS_ENCRYPTION=y CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y +CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m +CONFIG_FSCACHE=m +CONFIG_CACHEFILES=m +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_NTFS_FS=m +CONFIG_NTFS_RW=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y -# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_CONFIGFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_ROMFS_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFSD=m +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_CIFS=m +CONFIG_CIFS_STATS=y +CONFIG_CIFS_STATS2=y +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=1024 +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_RCU_TORTURE_TEST=m +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +CONFIG_LATENCYTOP=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_HIST_TRIGGERS=y +CONFIG_LKDTM=m +CONFIG_PERCPU_TEST=m +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_TEST_BPF=m +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_S390_PTDUMP=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_ENCRYPTED_KEYS=m +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 +CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_INTEGRITY_SIGNATURE=y +CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y +CONFIG_IMA=y +CONFIG_IMA_DEFAULT_HASH_SHA256=y +CONFIG_IMA_WRITE_POLICY=y +CONFIG_IMA_APPRAISE=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_USER=m +# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set +CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=m -CONFIG_CRYPTO_CBC=y -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD128=m 
CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m @@ -199,16 +649,16 @@ CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_SALSA20=m CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SM4=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_ANSI_CPRNG=m CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_ZCRYPT=m CONFIG_PKEY=m CONFIG_CRYPTO_PAES_S390=m @@ -217,38 +667,14 @@ CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRC7=m -# CONFIG_XZ_DEC_X86 is not set -# CONFIG_XZ_DEC_POWERPC is not set -# CONFIG_XZ_DEC_IA64 is not set -# CONFIG_XZ_DEC_ARM is not set -# CONFIG_XZ_DEC_ARMTHUMB is not set -# CONFIG_XZ_DEC_SPARC is not set -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_GDB_SCRIPTS=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_DEBUG_SECTION_MISMATCH=y -CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_PAGEALLOC=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_PANIC_ON_OOPS=y -CONFIG_PROVE_LOCKING=y -CONFIG_LOCK_STAT=y -CONFIG_DEBUG_LOCKDEP=y -CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_SG=y -CONFIG_DEBUG_NOTIFIERS=y -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -CONFIG_LATENCYTOP=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -# CONFIG_RUNTIME_TESTING_MENU is not set -CONFIG_S390_PTDUMP=y +CONFIG_CRC8=m +CONFIG_CORDIC=m +CONFIG_CMM=m +CONFIG_APPLDATA_BASE=y +CONFIG_KVM=m +CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_VSOCK=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig deleted file mode 100644 index 09aa5cb14873..000000000000 --- a/arch/s390/configs/performance_defconfig +++ /dev/null @@ -1,678 +0,0 @@ -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_AUDIT=y -CONFIG_NO_HZ_IDLE=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_NUMA_BALANCING=y -# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_BLK_CGROUP=y -CONFIG_CFS_BANDWIDTH=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_NAMESPACES=y -CONFIG_USER_NS=y -CONFIG_SCHED_AUTOGROUP=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_BPF_SYSCALL=y -CONFIG_USERFAULTFD=y -# CONFIG_COMPAT_BRK is not set -CONFIG_PROFILING=y -CONFIG_OPROFILE=m -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -CONFIG_MODULE_SIG_SHA256=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_WBT=y 
-CONFIG_BLK_WBT_SQ=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_IBM_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_DEFAULT_DEADLINE=y -CONFIG_LIVEPATCH=y -CONFIG_TUNE_ZEC12=y -CONFIG_NR_CPUS=512 -CONFIG_NUMA=y -CONFIG_HZ_100=y -CONFIG_KEXEC_FILE=y -CONFIG_KEXEC_VERIFY_SIG=y -CONFIG_EXPOLINE=y -CONFIG_EXPOLINE_AUTO=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_KSM=y -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -CONFIG_ZBUD=m -CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_STAT=y -CONFIG_DEFERRED_STRUCT_PAGE_INIT=y -CONFIG_IDLE_PAGE_TRACKING=y -CONFIG_PCI=y -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_S390=y -CONFIG_CHSC_SCH=y -CONFIG_VFIO_AP=m -CONFIG_CRASH_DUMP=y -CONFIG_BINFMT_MISC=m -CONFIG_HIBERNATION=y -CONFIG_PM_DEBUG=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=m -CONFIG_UNIX=y -CONFIG_UNIX_DIAG=m -CONFIG_XFRM_USER=m -CONFIG_NET_KEY=m -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IPGRE=m -CONFIG_NET_IPGRE_BROADCAST=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m -CONFIG_INET_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_INET6_XFRM_MODE_TRANSPORT=m -CONFIG_INET6_XFRM_MODE_TUNNEL=m -CONFIG_INET6_XFRM_MODE_BEET=m -CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_NETFILTER=y -CONFIG_NF_CONNTRACK=m -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_TABLES=m -CONFIG_NFT_CT=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_NAT=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NETFILTER_XT_SET=m -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m 
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -CONFIG_IP_SET=m -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_PE_SIP=m -CONFIG_NF_CONNTRACK_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_CHAIN_ROUTE_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_NF_CONNTRACK_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_NF_TABLES_BRIDGE=y -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -CONFIG_L2TP=m 
-CONFIG_L2TP_DEBUGFS=m -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_BRIDGE=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=y -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_DNS_RESOLVER=y -CONFIG_OPENVSWITCH=m -CONFIG_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_JIT=y -CONFIG_NET_PKTGEN=m -CONFIG_DEVTMPFS=y -CONFIG_DMA_CMA=y -CONFIG_CMA_SIZE_MBYTES=0 -CONFIG_CONNECTOR=y -CONFIG_ZRAM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=32768 -CONFIG_VIRTIO_BLK=y -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_NVME=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_GENWQE=m -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_CHR_DEV_OSST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=y -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SRP_ATTRS=m -CONFIG_ISCSI_TCP=m -CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -CONFIG_SCSI_OSD_INITIATOR=m -CONFIG_SCSI_OSD_ULD=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_MD_LINEAR=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_BLK_DEV_DM=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_DELAY=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m -CONFIG_DM_SWITCH=m -CONFIG_NETDEVICES=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_EQUALIZER=m -CONFIG_IFB=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_VXLAN=m -CONFIG_TUN=m -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -# CONFIG_NET_VENDOR_ARC is not set -# CONFIG_NET_VENDOR_CHELSIO is not set -# CONFIG_NET_VENDOR_INTEL is not set -# CONFIG_NET_VENDOR_MARVELL is not set -CONFIG_MLX4_EN=m -CONFIG_MLX5_CORE=m -CONFIG_MLX5_CORE_EN=y -# CONFIG_NET_VENDOR_NATSEMI is not set -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_MPPE=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_ISM=m -CONFIG_INPUT_EVDEV=y -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -CONFIG_LEGACY_PTY_COUNT=0 
-CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_RAW_DRIVER=m -CONFIG_HANGCHECK_TIMER=m -CONFIG_TN3270_FS=y -# CONFIG_HWMON is not set -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_NOWAYOUT=y -CONFIG_SOFT_WATCHDOG=m -CONFIG_DIAG288_WATCHDOG=m -CONFIG_DRM=y -CONFIG_DRM_VIRTIO_GPU=y -CONFIG_FRAMEBUFFER_CONSOLE=y -# CONFIG_HID is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_ACCESS=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_VFIO=m -CONFIG_VFIO_PCI=m -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_INPUT=y -CONFIG_S390_AP_IOMMU=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_JBD2_DEBUG=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=y -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_BTRFS_FS=y -CONFIG_BTRFS_FS_POSIX_ACL=y -CONFIG_NILFS2_FS=m -CONFIG_FS_DAX=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FS_ENCRYPTION=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_AUTOFS4_FS=m -CONFIG_FUSE_FS=y -CONFIG_CUSE=m -CONFIG_OVERLAY_FS=m -CONFIG_FSCACHE=m -CONFIG_CACHEFILES=m -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_NTFS_FS=m -CONFIG_NTFS_RW=y -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_HUGETLBFS=y -CONFIG_CONFIGFS_FS=m -CONFIG_ECRYPT_FS=m -CONFIG_CRAMFS=m -CONFIG_SQUASHFS=m -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_ROMFS_FS=m -CONFIG_NFS_FS=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFSD=m -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_CIFS=m -CONFIG_CIFS_STATS=y -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -# CONFIG_CIFS_DEBUG is not set -CONFIG_CIFS_DFS_UPCALL=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_GDB_SCRIPTS=y -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=1024 -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_MEMORY_INIT=y -CONFIG_PANIC_ON_OOPS=y -CONFIG_RCU_TORTURE_TEST=m -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -CONFIG_LATENCYTOP=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_HIST_TRIGGERS=y -CONFIG_LKDTM=m -CONFIG_PERCPU_TEST=m -CONFIG_ATOMIC64_SELFTEST=y -CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y -CONFIG_ENCRYPTED_KEYS=m -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 -CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_INTEGRITY_SIGNATURE=y -CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y -CONFIG_IMA=y -CONFIG_IMA_DEFAULT_HASH_SHA256=y -CONFIG_IMA_WRITE_POLICY=y -CONFIG_IMA_APPRAISE=y -CONFIG_CRYPTO_FIPS=y -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_USER=m -# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_TEST=m 
-CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA512=m -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=m -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_ZCRYPT=m -CONFIG_PKEY=m -CONFIG_CRYPTO_PAES_S390=m -CONFIG_CRYPTO_SHA1_S390=m -CONFIG_CRYPTO_SHA256_S390=m -CONFIG_CRYPTO_SHA512_S390=m -CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=y -CONFIG_CRC7=m -CONFIG_CRC8=m -CONFIG_CORDIC=m -CONFIG_CMM=m -CONFIG_APPLDATA_BASE=y -CONFIG_KVM=m -CONFIG_KVM_S390_UCONTROL=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_VSOCK=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 7dc7f58c4287..d92bab844b73 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -24,7 +24,6 @@ CONFIG_CRASH_DUMP=y # CONFIG_SECCOMP is not set CONFIG_NET=y # CONFIG_IUCV is not set -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_BLK_DEV_RAM=y # CONFIG_BLK_DEV_XPRAM is not set diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index 86aed30fad3a..eeeb6a7737a4 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -137,7 +137,7 @@ static struct shash_alg ghash_alg = { static int __init ghash_mod_init(void) { if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_GHASH)) - return -EOPNOTSUPP; + return -ENODEV; return crypto_register_shash(&ghash_alg); } diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 12cca467af7d..d977643fa627 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -824,7 +824,7 @@ static int __init prng_init(void) /* check if the CPU has a PRNG */ if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) - return -EOPNOTSUPP; + return -ENODEV; /* check if TRNG subfunction is available */ if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) @@ -837,7 +837,7 @@ static int __init prng_init(void) if (prng_mode == PRNG_MODE_SHA512) { pr_err("The prng module cannot " "start in SHA-512 mode\n"); - return -EOPNOTSUPP; + return -ENODEV; } prng_mode = PRNG_MODE_TDES; } else diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 009572e8276d..7c15542d3685 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -86,7 +86,7 @@ static struct shash_alg alg = { static int __init sha1_s390_init(void) { if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) - return -EOPNOTSUPP; + return -ENODEV; return crypto_register_shash(&alg); } diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index 62833a1d8724..af7505148f80 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -117,7 +117,7 @@ static int __init sha256_s390_init(void) int ret; if 
(!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) - return -EOPNOTSUPP; + return -ENODEV; ret = crypto_register_shash(&sha256_alg); if (ret < 0) goto out; diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index be589c340d15..ad29db085a18 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -127,7 +127,7 @@ static int __init init(void) int ret; if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) - return -EOPNOTSUPP; + return -ENODEV; if ((ret = crypto_register_shash(&sha512_alg)) < 0) goto out; if ((ret = crypto_register_shash(&sha384_alg)) < 0) diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index c10d2ee2dfda..01936fdfaddb 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -11,6 +11,7 @@ #define _ASM_S390_AIRQ_H #include <linux/bit_spinlock.h> +#include <linux/dma-mapping.h> struct airq_struct { struct hlist_node list; /* Handler queueing. */ @@ -29,6 +30,7 @@ void unregister_adapter_interrupt(struct airq_struct *airq); /* Adapter interrupt bit vector */ struct airq_iv { unsigned long *vector; /* Adapter interrupt bit vector */ + dma_addr_t vector_dma; /* Adapter interrupt bit vector dma */ unsigned long *avail; /* Allocation bit mask for the bit vector */ unsigned long *bitlock; /* Lock bit mask for the bit vector */ unsigned long *ptr; /* Pointer associated with each bit */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index fd20ab5d4cf7..491ad53a0d4e 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -84,9 +84,9 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) #define ATOMIC64_INIT(i) { (i) } -static inline long atomic64_read(const atomic64_t *v) +static inline s64 atomic64_read(const atomic64_t *v) { - long c; + s64 c; asm volatile( " lg %0,%1\n" @@ -94,49 +94,49 @@ static inline long atomic64_read(const atomic64_t *v) return c; } -static inline void atomic64_set(atomic64_t *v, long i) +static inline void atomic64_set(atomic64_t *v, s64 i) { asm volatile( " stg %1,%0\n" : "=Q" (v->counter) : "d" (i)); } -static inline long atomic64_add_return(long i, atomic64_t *v) +static inline s64 atomic64_add_return(s64 i, atomic64_t *v) { - return __atomic64_add_barrier(i, &v->counter) + i; + return __atomic64_add_barrier(i, (long *)&v->counter) + i; } -static inline long atomic64_fetch_add(long i, atomic64_t *v) +static inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { - return __atomic64_add_barrier(i, &v->counter); + return __atomic64_add_barrier(i, (long *)&v->counter); } -static inline void atomic64_add(long i, atomic64_t *v) +static inline void atomic64_add(s64 i, atomic64_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - __atomic64_add_const(i, &v->counter); + __atomic64_add_const(i, (long *)&v->counter); return; } #endif - __atomic64_add(i, &v->counter); + __atomic64_add(i, (long *)&v->counter); } #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +static inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - return __atomic64_cmpxchg(&v->counter, old, new); + return __atomic64_cmpxchg((long *)&v->counter, old, new); } #define ATOMIC64_OPS(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ +static inline void atomic64_##op(s64 i, atomic64_t *v) \ { \ - __atomic64_##op(i, &v->counter); \ + __atomic64_##op(i, (long *)&v->counter); \ 
} \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ +static inline s64 atomic64_fetch_##op(s64 i, atomic64_t *v) \ { \ - return __atomic64_##op##_barrier(i, &v->counter); \ + return __atomic64_##op##_barrier(i, (long *)&v->counter); \ } ATOMIC64_OPS(and) @@ -145,8 +145,8 @@ ATOMIC64_OPS(xor) #undef ATOMIC64_OPS -#define atomic64_sub_return(_i, _v) atomic64_add_return(-(long)(_i), _v) -#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long)(_i), _v) -#define atomic64_sub(_i, _v) atomic64_add(-(long)(_i), _v) +#define atomic64_sub_return(_i, _v) atomic64_add_return(-(s64)(_i), _v) +#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(s64)(_i), _v) +#define atomic64_sub(_i, _v) atomic64_add(-(s64)(_i), _v) #endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index a29dd430fb40..865ce1cb86d5 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -226,6 +226,10 @@ extern int ccw_device_enable_console(struct ccw_device *); extern void ccw_device_wait_idle(struct ccw_device *); extern int ccw_device_force_console(struct ccw_device *); +extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size); +extern void ccw_device_dma_free(struct ccw_device *cdev, + void *cpu_addr, size_t size); + int ccw_device_siosl(struct ccw_device *); extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 1727180e8ca1..b5bfb3123cb1 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -7,6 +7,7 @@ #include <linux/spinlock.h> #include <linux/bitops.h> +#include <linux/genalloc.h> #include <asm/types.h> #define LPM_ANYPATH 0xff @@ -264,6 +265,36 @@ struct ciw { #define CIW_TYPE_RNI 0x2 /* read node identifier */ /* + * Node Descriptor as defined in SA22-7204, "Common I/O-Device Commands" + */ + +#define ND_VALIDITY_VALID 0 +#define ND_VALIDITY_OUTDATED 1 +#define ND_VALIDITY_INVALID 2 + +struct node_descriptor { + /* Flags. */ + union { + struct { + u32 validity:3; + u32 reserved:5; + } __packed; + u8 byte0; + } __packed; + + /* Node parameters. */ + u32 params:24; + + /* Node ID. */ + char type[6]; + char model[3]; + char manufacturer[3]; + char plant[2]; + char seq[12]; + u16 tag; +} __packed; + +/* * Flags used as input parameters for do_IO() */ #define DOIO_ALLOW_SUSPEND 0x0001 /* allow for channel prog.
suspend */ @@ -328,6 +359,16 @@ static inline u8 pathmask_to_pos(u8 mask) void channel_subsystem_reinit(void); extern void css_schedule_reprobe(void); +extern void *cio_dma_zalloc(size_t size); +extern void cio_dma_free(void *cpu_addr, size_t size); +extern struct device *cio_get_dma_css_dev(void); + +void *cio_gp_dma_zalloc(struct gen_pool *gp_dma, struct device *dma_dev, + size_t size); +void cio_gp_dma_free(struct gen_pool *gp_dma, void *cpu_addr, size_t size); +void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev); +struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages); + /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index 3bda757317cf..0cf6b53587db 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -112,13 +112,8 @@ union ctlreg2 { }; }; -#ifdef CONFIG_SMP -# define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) -# define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) -#else -# define ctl_set_bit(cr, bit) __ctl_set_bit(cr, bit) -# define ctl_clear_bit(cr, bit) __ctl_clear_bit(cr, bit) -#endif +#define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) +#define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit) #endif /* __ASSEMBLY__ */ #endif /* __ASM_CTL_REG_H */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index c305d39f5016..310134015541 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -107,13 +107,37 @@ void debug_unregister(debug_info_t *id); void debug_set_level(debug_info_t *id, int new_level); void debug_set_critical(void); + void debug_stop_all(void); +/** + * debug_level_enabled() - Returns true if debug events for the specified + * level would be logged. Otherwise returns false. + * + * @id: handle for debug log + * @level: debug level + * + * Return: + * - %true if level is less or equal to the current debug level. 
+ */ static inline bool debug_level_enabled(debug_info_t *id, int level) { return level <= id->level; } +/** + * debug_event() - writes binary debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @data: pointer to data for debug entry + * @length: length of data in bytes + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_event(debug_info_t *id, int level, void *data, int length) { @@ -122,6 +146,18 @@ static inline debug_entry_t *debug_event(debug_info_t *id, int level, return debug_event_common(id, level, data, length); } +/** + * debug_int_event() - writes unsigned integer debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, unsigned int tag) { @@ -132,6 +168,18 @@ static inline debug_entry_t *debug_int_event(debug_info_t *id, int level, return debug_event_common(id, level, &t, sizeof(unsigned int)); } +/** + * debug_long_event() - writes unsigned long debug entry to active debug area + * (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @tag: long integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, unsigned long tag) { @@ -142,6 +190,18 @@ static inline debug_entry_t *debug_long_event(debug_info_t *id, int level, return debug_event_common(id, level, &t, sizeof(unsigned long)); } +/** + * debug_text_event() - writes string debug entry in ascii format to active + * debug area (if level <= actual debug level) + * + * @id: handle for debug log + * @level: debug level + * @txt: string for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, const char *txt) { @@ -152,12 +212,28 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.txt for more details! + * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) __attribute__ ((format(printf, 3, 4))); +/** + * debug_sprintf_event() - writes debug entry with format string + * and varargs (longs) to active debug area + * (if level <= actual debug level). + * + * @_id: handle for debug log + * @_level: debug level + * @_fmt: format string for debug entry + * @...: varargs used as in sprintf() + * + * Return: + * - Address of written debug entry + * - %NULL if error + * + * floats and long long datatypes cannot be used as varargs. + */ #define debug_sprintf_event(_id, _level, _fmt, ...) \ ({ \ debug_entry_t *__ret; \ @@ -172,6 +248,20 @@ __debug_sprintf_event(debug_info_t *id, int level, char *string, ...)
__ret; \ }) +/** + * debug_exception() - writes binary debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @data: pointer to data for debug entry + * @length: length of data in bytes + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_exception(debug_info_t *id, int level, void *data, int length) { @@ -180,6 +270,19 @@ static inline debug_entry_t *debug_exception(debug_info_t *id, int level, return debug_exception_common(id, level, data, length); } +/** + * debug_int_exception() - writes unsigned int debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @tag: integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_int_exception(debug_info_t *id, int level, unsigned int tag) { @@ -190,6 +293,19 @@ static inline debug_entry_t *debug_int_exception(debug_info_t *id, int level, return debug_exception_common(id, level, &t, sizeof(unsigned int)); } +/** + * debug_long_exception() - writes long debug entry to active debug area + * (if level <= actual debug level) + * and switches to next debug area + * + * @id: handle for debug log + * @level: debug level + * @tag: long integer value for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_long_exception (debug_info_t *id, int level, unsigned long tag) { @@ -200,6 +316,20 @@ static inline debug_entry_t *debug_long_exception (debug_info_t *id, int level, return debug_exception_common(id, level, &t, sizeof(unsigned long)); } +/** + * debug_text_exception() - writes string debug entry in ascii format to active + * debug area (if level <= actual debug level) + * and switches to next debug area + * area + * + * @id: handle for debug log + * @level: debug level + * @txt: string for debug entry + * + * Return: + * - Address of written debug entry + * - %NULL if error + */ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, const char *txt) { @@ -210,12 +340,30 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.txt for more details! + * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) __attribute__ ((format(printf, 3, 4))); + +/** + * debug_sprintf_exception() - writes debug entry with format string and + * varargs (longs) to active debug area + * (if level <= actual debug level) + * and switches to next debug area. + * + * @_id: handle for debug log + * @_level: debug level + * @_fmt: format string for debug entry + * @...: varargs used as in sprintf() + * + * Return: + * - Address of written debug entry + * - %NULL if error + * + * floats and long long datatypes cannot be used as varargs. + */ #define debug_sprintf_exception(_id, _level, _fmt, ...) \ ({ \ debug_entry_t *__ret; \ @@ -231,6 +379,7 @@ __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) 
}) int debug_register_view(debug_info_t *id, struct debug_view *view); + int debug_unregister_view(debug_info_t *id, struct debug_view *view); /* diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index e78cda94456b..68c476b20b57 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -59,6 +59,18 @@ static inline int test_facility(unsigned long nr) return __test_facility(nr, &S390_lowcore.stfle_fac_list); } +static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size) +{ + register unsigned long reg0 asm("0") = size - 1; + + asm volatile( + ".insn s,0xb2b00000,0(%1)" /* stfle */ + : "+d" (reg0) + : "a" (stfle_fac_list) + : "memory", "cc"); + return reg0; +} + /** * stfle - Store facility list extended * @stfle_fac_list: array where facility list can be stored @@ -75,13 +87,8 @@ static inline void __stfle(u64 *stfle_fac_list, int size) memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4); if (S390_lowcore.stfl_fac_list & 0x01000000) { /* More facility bits available with stfle */ - register unsigned long reg0 asm("0") = size - 1; - - asm volatile(".insn s,0xb2b00000,0(%1)" /* stfle */ - : "+d" (reg0) - : "a" (stfle_fac_list) - : "memory", "cc"); - nr = (reg0 + 1) * 8; /* # bytes stored by stfle */ + nr = __stfle_asm(stfle_fac_list, size); + nr = min_t(unsigned long, (nr + 1) * 8, size * 8); } memset((char *) stfle_fac_list + nr, 0, size * 8 - nr); } diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index 15578fd762f6..6fb7aced104a 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -122,8 +122,7 @@ idal_buffer_alloc(size_t size, int page_order) nr_ptrs = (size + IDA_BLOCK_SIZE - 1) >> IDA_SIZE_LOG; nr_chunks = (4096 << page_order) >> IDA_SIZE_LOG; - ib = kmalloc(sizeof(struct idal_buffer) + nr_ptrs*sizeof(void *), - GFP_DMA | GFP_KERNEL); + ib = kmalloc(struct_size(ib, data, nr_ptrs), GFP_DMA | GFP_KERNEL); if (ib == NULL) return ERR_PTR(-ENOMEM); ib->size = size; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2b00a3ebee08..abe60268335d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -18,6 +18,7 @@ #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/seqlock.h> +#include <linux/module.h> #include <asm/debug.h> #include <asm/cpu.h> #include <asm/fpu/api.h> @@ -720,8 +721,14 @@ struct kvm_s390_cpu_model { unsigned short ibc; }; +struct kvm_s390_module_hook { + int (*hook)(struct kvm_vcpu *vcpu); + struct module *owner; +}; + struct kvm_s390_crypto { struct kvm_s390_crypto_cb *crycb; + struct kvm_s390_module_hook *pqap_hook; __u32 crycbd; __u8 aes_kw; __u8 dea_kw; @@ -905,7 +912,6 @@ extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_check_processor_compat(void *rtn) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..3eb018508190 --- /dev/null +++ b/arch/s390/include/asm/mem_encrypt.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef S390_MEM_ENCRYPT_H__ +#define S390_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#define sme_me_mask 
0ULL + +static inline bool sme_active(void) { return false; } +extern bool sev_active(void); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASSEMBLY__ */ + +#endif /* S390_MEM_ENCRYPT_H__ */ diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 305befd55326..a2399eff84ca 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -194,6 +194,11 @@ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); #ifdef CONFIG_PCI +static inline bool zpci_use_mio(struct zpci_dev *zdev) +{ + return static_branch_likely(&have_mio) && zdev->mio_capable; +} + /* Error handling and recovery */ void zpci_event_error(void *); void zpci_event_availability(void *); diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 0095ddb58ff6..50b4ce8cddfd 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -16,7 +16,7 @@ * per cpu area, use weak definitions to force the compiler to * generate external references. */ -#if defined(CONFIG_SMP) && defined(MODULE) +#if defined(MODULE) #define ARCH_NEEDS_WEAK_PER_CPU #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9f0195d5fa16..9b274fcaacb6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1270,14 +1270,8 @@ static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; return end <= current->mm->context.asce_limit; } #define gup_fast_permitted gup_fast_permitted diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index b0fcbc37b637..14883b1562e0 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -36,6 +36,7 @@ #ifndef __ASSEMBLY__ +#include <linux/cpumask.h> #include <linux/linkage.h> #include <linux/irqflags.h> #include <asm/cpu.h> @@ -221,12 +222,6 @@ static __no_kasan_or_inline unsigned short stap(void) return cpu_address; } -/* - * Give up the time slice of the virtual PU. 
- */ -#define cpu_relax_yield cpu_relax_yield -void cpu_relax_yield(void); - #define cpu_relax() barrier() #define ECAG_CACHE_ATTRIBUTE 0 diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 3907ead27ffa..b157a81fb977 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -9,9 +9,6 @@ #define __ASM_SMP_H #include <asm/sigp.h> - -#ifdef CONFIG_SMP - #include <asm/lowcore.h> #define raw_smp_processor_id() (S390_lowcore.cpu_nr) @@ -40,33 +37,6 @@ extern int smp_cpu_get_polarization(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); -#else /* CONFIG_SMP */ - -#define smp_cpu_mtid 0 - -static inline void smp_call_ipl_cpu(void (*func)(void *), void *data) -{ - func(data); -} - -static inline void smp_call_online_cpu(void (*func)(void *), void *data) -{ - func(data); -} - -static inline void smp_emergency_stop(void) -{ -} - -static inline int smp_find_processor_id(u16 address) { return 0; } -static inline int smp_store_status(int cpu) { return 0; } -static inline int smp_vcpu_scheduled(int cpu) { return 1; } -static inline void smp_yield_cpu(int cpu) { } -static inline void smp_fill_possible_mask(void) { } -static inline void smp_detect_cpus(void) { } - -#endif /* CONFIG_SMP */ - static inline void smp_stop_cpu(void) { u16 pcpu = stap(); @@ -83,14 +53,9 @@ static inline int smp_get_base_cpu(int cpu) return cpu - (cpu % (smp_cpu_mtid + 1)); } -#ifdef CONFIG_HOTPLUG_CPU extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); -#else -static inline int smp_rescan_cpus(void) { return 0; } -static inline void cpu_die(void) { } -#endif #endif /* __ASM_SMP_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 0a29588aa00b..c02bff33f6c7 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -20,11 +20,7 @@ extern int spin_retry; -#ifndef CONFIG_SMP -static inline bool arch_vcpu_is_preempted(int cpu) { return false; } -#else bool arch_vcpu_is_preempted(int cpu); -#endif #define vcpu_is_preempted arch_vcpu_is_preempted diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 8c840f0904f3..82703e03f35d 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -32,7 +32,6 @@ static inline void __tlb_flush_idte(unsigned long asce) : : "a" (opt), "a" (asce) : "cc"); } -#ifdef CONFIG_SMP void smp_ptlb_all(void); /* @@ -83,22 +82,6 @@ static inline void __tlb_flush_kernel(void) else __tlb_flush_global(); } -#else -#define __tlb_flush_global() __tlb_flush_local() - -/* - * Flush TLB entries for a specific ASCE on all CPUs. - */ -static inline void __tlb_flush_mm(struct mm_struct *mm) -{ - __tlb_flush_local(); -} - -static inline void __tlb_flush_kernel(void) -{ - __tlb_flush_local(); -} -#endif static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index 6eb2ef105d87..d827b5b9a32c 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -79,23 +79,4 @@ static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} -#ifdef CONFIG_KASAN -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. 
- */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) -#else -#define READ_ONCE_TASK_STACK(task, x) READ_ONCE(x) -#endif - #endif /* _ASM_S390_UNWIND_H */ diff --git a/arch/s390/include/uapi/asm/runtime_instr.h b/arch/s390/include/uapi/asm/runtime_instr.h index 45c9ec984e6b..455da46e3193 100644 --- a/arch/s390/include/uapi/asm/runtime_instr.h +++ b/arch/s390/include/uapi/asm/runtime_instr.h @@ -57,7 +57,7 @@ struct runtime_instr_cb { __u64 sf; __u64 rsic; __u64 reserved8; -} __packed __aligned(8); +} __attribute__((__packed__, __aligned__(8))); static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb) { diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index b0478d01a0c5..0f255b54b051 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -53,6 +53,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o +obj-y += smp.o extra-y += head64.o vmlinux.lds @@ -60,7 +61,6 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o obj-$(CONFIG_AUDIT) += audit.o diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 6f2a193ccccc..38d4bdbc34b9 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -194,7 +194,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) load_sigregs(); return regs->gprs[2]; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -217,7 +217,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) load_sigregs(); return regs->gprs[2]; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 0ebf08c3b35e..6d321f5f101d 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -647,11 +647,23 @@ static int debug_close(struct inode *inode, struct file *file) return 0; /* success */ } -/* - * debug_register_mode: - * - Creates and initializes debug area for the caller - * The mode parameter allows to specify access rights for the s390dbf files - * - Returns handle for debug area +/** + * debug_register_mode() - creates and initializes debug area. + * + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages_per_area: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * @mode: File mode for debugfs files. E.g. S_IRWXUGO + * @uid: User ID for debugfs files. Currently only 0 is supported. + * @gid: Group ID for debugfs files. Currently only 0 is supported. + * + * Return: + * - Handle for generated debug area + * - %NULL if register failed + * + * Allocates memory for a debug log. + * Must not be called within an interrupt handler. 
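A hypothetical usage sketch may help here; debug_register(), documented below, is the plain-mode wrapper, and the sprintf view and event macro are this facility's usual companions (the buffer sizing shown is illustrative, not prescriptive):

	static debug_info_t *dbf;

	dbf = debug_register("mydrv", 4, 1, 4 * sizeof(long));
	if (!dbf)
		return -ENOMEM;
	debug_register_view(dbf, &debug_sprintf_view);
	debug_set_level(dbf, 3);
	debug_sprintf_event(dbf, 1, "init done, rc=%d", rc);
	/* ... */
	debug_unregister(dbf);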
*/ debug_info_t *debug_register_mode(const char *name, int pages_per_area, int nr_areas, int buf_size, umode_t mode, @@ -681,10 +693,21 @@ out: } EXPORT_SYMBOL(debug_register_mode); -/* - * debug_register: - * - creates and initializes debug area for the caller - * - returns handle for debug area +/** + * debug_register() - creates and initializes debug area with default file mode. + * + * @name: Name of debug log (e.g. used for debugfs entry) + * @pages_per_area: Number of pages, which will be allocated per area + * @nr_areas: Number of debug areas + * @buf_size: Size of data area in each debug entry + * + * Return: + * - Handle for generated debug area + * - %NULL if register failed + * + * Allocates memory for a debug log. + * The debugfs file mode access permissions are read and write for user. + * Must not be called within an interrupt handler. */ debug_info_t *debug_register(const char *name, int pages_per_area, int nr_areas, int buf_size) @@ -694,9 +717,13 @@ debug_info_t *debug_register(const char *name, int pages_per_area, } EXPORT_SYMBOL(debug_register); -/* - * debug_unregister: - * - give back debug area +/** + * debug_unregister() - give back debug area. + * + * @id: handle for debug log + * + * Return: + * none */ void debug_unregister(debug_info_t *id) { @@ -745,9 +772,14 @@ out: return rc; } -/* - * debug_set_level: - * - set actual debug level +/** + * debug_set_level() - Sets new actual debug level if new_level is valid. + * + * @id: handle for debug log + * @new_level: new debug level + * + * Return: + * none */ void debug_set_level(debug_info_t *id, int new_level) { @@ -873,6 +905,14 @@ static struct ctl_table s390dbf_dir_table[] = { static struct ctl_table_header *s390dbf_sysctl_header; +/** + * debug_stop_all() - stops the debug feature if stopping is allowed. + * + * Return: + * - none + * + * Currently used in case of a kernel oops. + */ void debug_stop_all(void) { if (debug_stoppable) @@ -880,6 +920,17 @@ void debug_stop_all(void) } EXPORT_SYMBOL(debug_stop_all); +/** + * debug_set_critical() - event/exception functions try lock instead of spin. + * + * Return: + * - none + * + * Currently used in case of stopping all CPUs but the current one. + * Once in this state, functions to write a debug entry for an + * event or exception no longer spin on the debug area lock, + * but only try to get it and fail if they do not get the lock. 
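Sketched, with the lock and field names assumed from context, the event and exception writers behave like this once debug_set_critical() has been called:

	if (debug_critical) {
		if (!spin_trylock_irqsave(&id->lock, flags))
			return NULL;	/* entry is dropped, no spinning */
	} else {
		spin_lock_irqsave(&id->lock, flags);
	}

which is what keeps the oops path from deadlocking when all other CPUs have already been stopped.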
+ */ void debug_set_critical(void) { debug_critical = 1; @@ -1036,8 +1087,16 @@ debug_entry_t *__debug_sprintf_exception(debug_info_t *id, int level, char *stri } EXPORT_SYMBOL(__debug_sprintf_exception); -/* - * debug_register_view: +/** + * debug_register_view() - registers new debug view and creates debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error */ int debug_register_view(debug_info_t *id, struct debug_view *view) { @@ -1077,8 +1136,16 @@ out: } EXPORT_SYMBOL(debug_register_view); -/* - * debug_unregister_view: +/** + * debug_unregister_view() - unregisters debug view and removes debugfs + * dir entry + * + * @id: handle for debug log + * @view: pointer to debug view struct + * + * Return: + * - 0 : ok + * - < 0: Error */ int debug_unregister_view(debug_info_t *id, struct debug_view *view) { diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index b2c68fbf2634..7abe6ae261b4 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -242,6 +242,7 @@ static const unsigned char formats[][6] = { [INSTR_RRF_U0FF] = { F_24, U4_16, F_28, 0, 0, 0 }, [INSTR_RRF_U0RF] = { R_24, U4_16, F_28, 0, 0, 0 }, [INSTR_RRF_U0RR] = { R_24, R_28, U4_16, 0, 0, 0 }, + [INSTR_RRF_URR] = { R_24, R_28, U8_16, 0, 0, 0 }, [INSTR_RRF_UUFF] = { F_24, U4_16, F_28, U4_20, 0, 0 }, [INSTR_RRF_UUFR] = { F_24, U4_16, R_28, U4_20, 0, 0 }, [INSTR_RRF_UURF] = { R_24, U4_16, F_28, U4_20, 0, 0 }, @@ -306,7 +307,7 @@ static const unsigned char formats[][6] = { [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, - [INSTR_VRR_RV0U] = { R_8, V_12, U4_24, 0, 0, 0 }, + [INSTR_VRR_RV0UU] = { R_8, V_12, U4_24, U4_28, 0, 0 }, [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, @@ -326,10 +327,8 @@ static const unsigned char formats[][6] = { [INSTR_VRS_RVRDU] = { R_8, V_12, D_20, B_16, U4_32, 0 }, [INSTR_VRS_VRRD] = { V_8, R_12, D_20, B_16, 0, 0 }, [INSTR_VRS_VRRDU] = { V_8, R_12, D_20, B_16, U4_32, 0 }, - [INSTR_VRS_VVRD] = { V_8, V_12, D_20, B_16, 0, 0 }, [INSTR_VRS_VVRDU] = { V_8, V_12, D_20, B_16, U4_32, 0 }, [INSTR_VRV_VVXRDU] = { V_8, D_20, VX_12, B_16, U4_32, 0 }, - [INSTR_VRX_VRRD] = { V_8, D_20, X_12, B_16, 0, 0 }, [INSTR_VRX_VRRDU] = { V_8, D_20, X_12, B_16, U4_32, 0 }, [INSTR_VRX_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VSI_URDV] = { V_32, D_20, B_16, U8_8, 0, 0 }, diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 9e87b68be21c..ac06c3949ab3 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -199,9 +199,7 @@ void die(struct pt_regs *regs, const char *str) #ifdef CONFIG_PREEMPT pr_cont("PREEMPT "); #endif -#ifdef CONFIG_SMP pr_cont("SMP "); -#endif if (debug_pagealloc_enabled()) pr_cont("DEBUG_PAGEALLOC"); pr_cont("\n"); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3f4d272577d3..270d1d145761 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -986,14 +986,12 @@ ENTRY(psw_idle) stg %r3,__SF_EMPTY(%r15) larl %r1,.Lpsw_idle_lpsw+4 stg %r1,__SF_EMPTY+8(%r15) -#ifdef CONFIG_SMP larl %r1,smp_cpu_mtid llgf %r1,0(%r1) ltgr %r1,%r1 jz .Lpsw_idle_stcctm .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15) .Lpsw_idle_stcctm: -#endif oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT BPON STCK __CLOCK_IDLE_ENTER(%r2) @@ -1468,7 +1466,6 @@ 
ENDPROC(cleanup_critical) mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2) mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2) 1: # calculate idle cycles -#ifdef CONFIG_SMP clg %r9,BASED(.Lcleanup_idle_insn) jl 3f larl %r1,smp_cpu_mtid @@ -1486,7 +1483,6 @@ ENDPROC(cleanup_critical) la %r3,8(%r3) la %r4,8(%r4) brct %r1,2b -#endif 3: # account system time going idle lg %r9,__LC_STEAL_TIMER alg %r9,__CLOCK_IDLE_ENTER(%r2) diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 20420c2b8a14..b2956d49b6ad 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -63,7 +63,6 @@ void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); void __init time_init(void); -int pfn_is_nosave(unsigned long); void s390_early_resume(void); unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 3f10b56bd5a3..ab584e8e3527 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -15,16 +15,11 @@ struct insn { s32 offset; } __packed; -struct insn_args { - struct jump_entry *entry; - enum jump_label_type type; -}; - static void jump_label_make_nop(struct jump_entry *entry, struct insn *insn) { - /* brcl 0,0 */ + /* brcl 0,offset */ insn->opcode = 0xc004; - insn->offset = 0; + insn->offset = (jump_entry_target(entry) - jump_entry_code(entry)) >> 1; } static void jump_label_make_branch(struct jump_entry *entry, struct insn *insn) @@ -77,23 +72,15 @@ static void __jump_label_transform(struct jump_entry *entry, s390_kernel_write(code, &new, sizeof(new)); } -static int __sm_arch_jump_label_transform(void *data) +static void __jump_label_sync(void *dummy) { - struct insn_args *args = data; - - __jump_label_transform(args->entry, args->type, 0); - return 0; } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - struct insn_args args; - - args.entry = entry; - args.type = type; - - stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL); + __jump_label_transform(entry, type, 0); + smp_call_function(__jump_label_sync, NULL, 1); } void arch_jump_label_transform_static(struct jump_entry *entry, diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 8a1ae140c5e2..444a19125a81 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -141,7 +141,6 @@ static noinline void __machine_kdump(void *image) */ store_status(__do_machine_kdump, image); } -#endif static unsigned long do_start_kdump(unsigned long addr) { @@ -155,6 +154,8 @@ static unsigned long do_start_kdump(unsigned long addr) return rc; } +#endif /* CONFIG_CRASH_DUMP */ + /* * Check if kdump checksums are valid: We call purgatory with parameter "0" */ diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 5de13307b703..6ebc2117c66c 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -7,6 +7,7 @@ #define KMSG_COMPONENT "cpu" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/stop_machine.h> #include <linux/cpufeature.h> #include <linux/bitops.h> #include <linux/kernel.h> @@ -31,6 +32,7 @@ struct cpu_info { }; static DEFINE_PER_CPU(struct cpu_info, cpu_info); +static DEFINE_PER_CPU(int, cpu_relax_retry); static bool machine_has_cpu_mhz; @@ -58,15 +60,20 @@ void s390_update_cpu_mhz(void) on_each_cpu(update_cpu_mhz, NULL, 0); } -void notrace 
cpu_relax_yield(void) +void notrace stop_machine_yield(const struct cpumask *cpumask) { - if (!smp_cpu_mtid && MACHINE_HAS_DIAG44) { - diag_stat_inc(DIAG_STAT_X044); - asm volatile("diag 0,0,0x44"); + int cpu, this_cpu; + + this_cpu = smp_processor_id(); + if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { + __this_cpu_write(cpu_relax_retry, 0); + cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + if (cpu >= nr_cpu_ids) + return; + if (arch_vcpu_is_preempted(cpu)) + smp_yield_cpu(cpu); } - barrier(); } -EXPORT_SYMBOL(cpu_relax_yield); /* * cpu_init - initializes state that is per-CPU. diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f8544d517430..2b94b0ad3588 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -461,11 +461,9 @@ static void __init setup_lowcore_dat_off(void) mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); -#ifdef CONFIG_SMP lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); -#endif lc->br_r1_trampoline = 0x07f1; /* br %r1 */ set_prefix((u32)(unsigned long) lc); diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 22f08245aa5d..e6fca5498e1f 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -232,7 +232,7 @@ SYSCALL_DEFINE0(sigreturn) load_sigregs(); return regs->gprs[2]; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -256,7 +256,7 @@ SYSCALL_DEFINE0(rt_sigreturn) load_sigregs(); return regs->gprs[2]; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 35fafa2b91a8..44974654cbd0 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -232,8 +232,6 @@ out: return -ENOMEM; } -#ifdef CONFIG_HOTPLUG_CPU - static void pcpu_free_lowcore(struct pcpu *pcpu) { unsigned long async_stack, nodat_stack, lowcore; @@ -253,8 +251,6 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) free_pages(lowcore, LC_ORDER); } -#endif /* CONFIG_HOTPLUG_CPU */ - static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { struct lowcore *lc = pcpu->lowcore; @@ -418,7 +414,7 @@ void smp_yield_cpu(int cpu) diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" : : "d" (pcpu_devices[cpu].address)); - } else if (MACHINE_HAS_DIAG44) { + } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) { diag_stat_inc_norecursion(DIAG_STAT_X044); asm volatile("diag 0,0,0x44"); } @@ -895,8 +891,6 @@ static int __init _setup_possible_cpus(char *s) } early_param("possible_cpus", _setup_possible_cpus); -#ifdef CONFIG_HOTPLUG_CPU - int __cpu_disable(void) { unsigned long cregs[16]; @@ -937,8 +931,6 @@ void __noreturn cpu_die(void) for (;;) ; } -#endif /* CONFIG_HOTPLUG_CPU */ - void __init smp_fill_possible_mask(void) { unsigned int possible, sclp_max, cpu; @@ -996,7 +988,6 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } -#ifdef CONFIG_HOTPLUG_CPU static ssize_t cpu_configure_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1073,7 +1064,6 @@ out: return rc ? 
rc : count; } static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); -#endif /* CONFIG_HOTPLUG_CPU */ static ssize_t show_cpu_address(struct device *dev, struct device_attribute *attr, char *buf) @@ -1083,9 +1073,7 @@ static ssize_t show_cpu_address(struct device *dev, static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); static struct attribute *cpu_common_attrs[] = { -#ifdef CONFIG_HOTPLUG_CPU &dev_attr_configure.attr, -#endif &dev_attr_address.attr, NULL, }; @@ -1144,15 +1132,11 @@ static int smp_add_present_cpu(int cpu) out_topology: sysfs_remove_group(&s->kobj, &cpu_common_attr_group); out_cpu: -#ifdef CONFIG_HOTPLUG_CPU unregister_cpu(c); -#endif out: return rc; } -#ifdef CONFIG_HOTPLUG_CPU - int __ref smp_rescan_cpus(void) { struct sclp_core_info *info; @@ -1188,17 +1172,14 @@ static ssize_t __ref rescan_store(struct device *dev, return rc ? rc : count; } static DEVICE_ATTR_WO(rescan); -#endif /* CONFIG_HOTPLUG_CPU */ static int __init s390_smp_init(void) { int cpu, rc = 0; -#ifdef CONFIG_HOTPLUG_CPU rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); if (rc) return rc; -#endif for_each_present_cpu(cpu) { rc = smp_add_present_cpu(cpu); if (rc) diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 19a3c427801a..a7baf0b5f818 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -162,7 +162,6 @@ ENTRY(swsusp_arch_resume) larl %r1,__swsusp_reset_dma lg %r1,0(%r1) BASR_EX %r14,%r1 -#ifdef CONFIG_SMP larl %r1,smp_cpu_mt_shift icm %r1,15,0(%r1) jz smt_done @@ -172,7 +171,6 @@ smt_loop: brc 8,smt_done /* accepted */ brc 2,smt_loop /* busy, try again */ smt_done: -#endif larl %r1,.Lnew_pgm_check_psw lpswe 0(%r1) pgm_check_entry: diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index e822b2964a83..6ebacfeaf853 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -436,3 +436,4 @@ 431 common fsconfig sys_fsconfig sys_fsconfig 432 common fsmount sys_fsmount sys_fsmount 433 common fspick sys_fspick sys_fspick +434 common pidfd_open sys_pidfd_open sys_pidfd_open diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 82e81a9f7112..164c0282b41a 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -45,7 +45,7 @@ int is_valid_bugaddr(unsigned long addr) void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { if (user_mode(regs)) { - force_sig_fault(si_signo, si_code, get_trap_ip(regs), current); + force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); } else { const struct exception_table_entry *fixup; @@ -79,7 +79,7 @@ void do_per_trap(struct pt_regs *regs) if (!current->ptrace) return; force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __force __user *) current->thread.per_event.address, current); + (void __force __user *) current->thread.per_event.address); } NOKPROBE_SYMBOL(do_per_trap); @@ -165,7 +165,7 @@ void illegal_op(struct pt_regs *regs) return; if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { if (current->ptrace) - force_sig_fault(SIGTRAP, TRAP_BRKPT, location, current); + force_sig_fault(SIGTRAP, TRAP_BRKPT, location); else signal = SIGILL; #ifdef CONFIG_UPROBES @@ -229,17 +229,11 @@ void vector_exception(struct pt_regs *regs) void data_exception(struct pt_regs *regs) { - int signal = 0; - save_fpu_regs(); if (current->thread.fpu.fpc & FPC_DXC_MASK) - signal = SIGFPE; - else - signal = SIGILL; - if (signal == SIGFPE) 
do_fp_trap(regs, current->thread.fpu.fpc); - else if (signal) - do_trap(regs, signal, ILL_ILLOPN, "data exception"); + else + do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); } void space_switch_exception(struct pt_regs *regs) diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 57fd4e902f1f..3ce8a0808059 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -46,18 +46,18 @@ bool unwind_next_frame(struct unwind_state *state) regs = state->regs; if (unlikely(regs)) { - sp = READ_ONCE_TASK_STACK(state->task, regs->gprs[15]); + sp = READ_ONCE_NOCHECK(regs->gprs[15]); if (unlikely(outside_of_stack(state, sp))) { if (!update_stack_info(state, sp)) goto out_err; } sf = (struct stack_frame *) sp; - ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; regs = NULL; } else { sf = (struct stack_frame *) state->sp; - sp = READ_ONCE_TASK_STACK(state->task, sf->back_chain); + sp = READ_ONCE_NOCHECK(sf->back_chain); if (likely(sp)) { /* Non-zero back-chain points to the previous frame */ if (unlikely(outside_of_stack(state, sp))) { @@ -65,7 +65,7 @@ bool unwind_next_frame(struct unwind_state *state) goto out_err; } sf = (struct stack_frame *) sp; - ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = true; } else { /* No back-chain, look for a pt_regs structure */ @@ -73,9 +73,9 @@ bool unwind_next_frame(struct unwind_state *state) if (!on_stack(info, sp, sizeof(struct pt_regs))) goto out_stop; regs = (struct pt_regs *) sp; - if (user_mode(regs)) + if (READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE) goto out_stop; - ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + ip = READ_ONCE_NOCHECK(regs->psw.addr); reliable = true; } } @@ -132,11 +132,11 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Get the instruction pointer from pt_regs or the stack frame */ if (regs) { - ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + ip = READ_ONCE_NOCHECK(regs->psw.addr); reliable = true; } else { sf = (struct stack_frame *) sp; - ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 28ebd647784c..3f520cd837fb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -227,6 +227,11 @@ int kvm_arch_hardware_enable(void) return 0; } +int kvm_arch_check_processor_compat(void) +{ + return 0; +} + static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); @@ -2418,13 +2423,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); if (!kvm->arch.sca) goto out_err; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); sca_offset += 16; if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) sca_offset = 0; kvm->arch.sca = (struct bsca_block *) ((char *) kvm->arch.sca + sca_offset); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); sprintf(debug_name, "kvm-%u", current->pid); @@ -2461,6 +2466,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_kvm_facility(kvm->arch.model.fac_list, 147); } + if (css_general_characteristics.aiv && test_facility(65)) + set_kvm_facility(kvm->arch.model.fac_mask, 65); + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 
8679bd74d337..ed52ffa8d5d4 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -27,6 +27,7 @@
 #include <asm/io.h>
 #include <asm/ptrace.h>
 #include <asm/sclp.h>
+#include <asm/ap.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
 #include "trace.h"
@@ -592,6 +593,89 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
 	}
 }
 
+/*
+ * handle_pqap: handle PQAP interception
+ * @vcpu: the vcpu that issued the pqap instruction
+ *
+ * We now support PQAP/AQIC instructions and we need to correctly
+ * answer the guest even if no dedicated driver's hook is available.
+ *
+ * The intercepting code calls a dedicated callback for this instruction
+ * if a driver did register one in the CRYPTO satellite of the
+ * SIE block.
+ *
+ * If no callback is available, the queues are not available: return the
+ * corresponding response code to the caller and set CC to 3.
+ * Otherwise return the response code returned by the callback.
+ */
+static int handle_pqap(struct kvm_vcpu *vcpu)
+{
+	struct ap_queue_status status = {};
+	unsigned long reg0;
+	int ret;
+	uint8_t fc;
+
+	/* Verify that the AP instructions are available */
+	if (!ap_instructions_available())
+		return -EOPNOTSUPP;
+	/* Verify that the guest is allowed to use AP instructions */
+	if (!(vcpu->arch.sie_block->eca & ECA_APIE))
+		return -EOPNOTSUPP;
+	/*
+	 * The only possibly intercepted functions when AP instructions are
+	 * available for the guest are AQIC and TAPQ with the t bit set;
+	 * since we do not set IC.3 (FIII) we currently will only intercept
+	 * the AQIC function code.
+	 */
+	reg0 = vcpu->run->s.regs.gprs[0];
+	fc = (reg0 >> 24) & 0xff;
+	if (WARN_ON_ONCE(fc != 0x03))
+		return -EOPNOTSUPP;
+
+	/* The PQAP instruction is allowed for the guest kernel only */
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	/* Common PQAP instruction specification exceptions */
+	/* bits 41-47 must all be zeros */
+	if (reg0 & 0x007f0000UL)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	/* APFT not installed and T bit set */
+	if (!test_kvm_facility(vcpu->kvm, 15) && (reg0 & 0x00800000UL))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	/* APXA not installed and APID greater than 64 or APQI greater than 16 */
+	if (!(vcpu->kvm->arch.crypto.crycbd & 0x02) && (reg0 & 0x0000c0f0UL))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	/* AQIC function code specific exception */
+	/* facility 65 not present for the AQIC function code */
+	if (!test_kvm_facility(vcpu->kvm, 65))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	/*
+	 * Verify that the hook callback is registered, lock the owner
+	 * and call the hook.
+	 */
+	if (vcpu->kvm->arch.crypto.pqap_hook) {
+		if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner))
+			return -EOPNOTSUPP;
+		ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu);
+		module_put(vcpu->kvm->arch.crypto.pqap_hook->owner);
+		if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000)
+			kvm_s390_set_psw_cc(vcpu, 3);
+		return ret;
+	}
+	/*
+	 * A vfio driver must register a hook.
+	 * No hook means no driver to enable the SIE CRYCB and no queues.
+	 * We send this response to the guest.
+	 */
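One detail worth spelling out, since the constant is open-coded in the fallback below: an AP response code of 0x01 means "AP queue not available" (AP_RESPONSE_Q_NOT_AVAIL in the AP bus headers, to the best of my reading), so a guest issuing PQAP/AQIC with no hook registered sees the queue as simply absent:

	status.response_code = 0x01;	/* AP queue not available */
	kvm_s390_set_psw_cc(vcpu, 3);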
+	status.response_code = 0x01;
+	memcpy(&vcpu->run->s.regs.gprs[1], &status, sizeof(status));
+	kvm_s390_set_psw_cc(vcpu, 3);
+	return 0;
+}
+
 static int handle_stfl(struct kvm_vcpu *vcpu)
 {
 	int rc;
@@ -878,6 +962,8 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
 		return handle_sthyi(vcpu);
 	case 0x7d:
 		return handle_stsi(vcpu);
+	case 0xaf:
+		return handle_pqap(vcpu);
 	case 0xb1:
 		return handle_stfl(vcpu);
 	case 0xb2:
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 5418d10dc2a8..a1ec63abfb95 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -3,9 +3,8 @@
 # Makefile for s390-specific library files.
 #
-lib-y += delay.o string.o uaccess.o find.o
+lib-y += delay.o string.o uaccess.o find.o spinlock.o
 obj-y += mem.o xor.o
-lib-$(CONFIG_SMP) += spinlock.o
 lib-$(CONFIG_KPROBES) += probes.o
 lib-$(CONFIG_UPROBES) += probes.o
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index df75d574246d..0ba174f779da 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -248,8 +248,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 {
 	report_user_fault(regs, SIGSEGV, 1);
 	force_sig_fault(SIGSEGV, si_code,
-			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
-			current);
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
 const struct exception_table_entry *s390_search_extables(unsigned long addr)
@@ -310,8 +309,7 @@ static noinline void do_sigbus(struct pt_regs *regs)
 	 * or user mode.
 	 */
 	force_sig_fault(SIGBUS, BUS_ADRERR,
-			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
-			current);
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
 static noinline int signal_return(struct pt_regs *regs)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 14d1eae9fe43..f0bee6af3960 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -18,6 +18,7 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
+#include <linux/swiotlb.h>
 #include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
@@ -29,6 +30,7 @@
 #include <linux/export.h>
 #include <linux/cma.h>
 #include <linux/gfp.h>
+#include <linux/dma-mapping.h>
 #include <asm/processor.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
@@ -42,6 +44,8 @@
 #include <asm/sclp.h>
 #include <asm/set_memory.h>
 #include <asm/kasan.h>
+#include <asm/dma-mapping.h>
+#include <asm/uv.h>
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
 
@@ -128,6 +132,47 @@ void mark_rodata_ro(void)
 	pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
 }
 
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+	int i;
+
+	/* make the specified pages unshared (swiotlb, dma_free) */
+	for (i = 0; i < numpages; ++i) {
+		uv_remove_shared(addr);
+		addr += PAGE_SIZE;
+	}
+	return 0;
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+	int i;
+	/* make the specified pages shared (swiotlb, dma_alloc) */
+	for (i = 0; i < numpages; ++i) {
+		uv_set_shared(addr);
+		addr += PAGE_SIZE;
+	}
+	return 0;
+}
+
+/* are we a protected virtualization guest?
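On s390, "memory encryption" maps onto Ultravisor page sharing: set_memory_decrypted() above marks pages shared with the hypervisor through uv_set_shared(), and set_memory_encrypted() reverses that through uv_remove_shared(). A bounce-buffer round trip under protected virtualization therefore looks, in sketch form, like:

	set_memory_decrypted(addr, npages);	/* share with the hypervisor */
	/* ... device DMA through the shared pages ... */
	set_memory_encrypted(addr, npages);	/* withdraw the sharing */

which is exactly what the swiotlb wiring in pv_init() below relies on.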
*/ +bool sev_active(void) +{ + return is_prot_virt_guest(); +} + +/* protected virtualization */ +static void pv_init(void) +{ + if (!is_prot_virt_guest()) + return; + + /* make sure bounce buffers are shared */ + swiotlb_init(1); + swiotlb_update_mem_attributes(); + swiotlb_force = SWIOTLB_FORCE; +} + void __init mem_init(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); @@ -136,6 +181,8 @@ void __init mem_init(void) set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + pv_init(); + /* Setup guest page hinting */ cmma_init(); diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 818deeb1ebc3..1864a8bb9622 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -52,21 +52,22 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz * Therefore we have a read-modify-write sequence: the function reads eight * bytes from destination at an eight byte boundary, modifies the bytes * requested and writes the result back in a loop. - * - * Note: this means that this function may not be called concurrently on - * several cpus with overlapping words, since this may potentially - * cause data corruption. */ +static DEFINE_SPINLOCK(s390_kernel_write_lock); + void notrace s390_kernel_write(void *dst, const void *src, size_t size) { + unsigned long flags; long copied; + spin_lock_irqsave(&s390_kernel_write_lock, flags); while (size) { copied = s390_kernel_write_odd(dst, src, size); dst += copied; src += copied; size -= copied; } + spin_unlock_irqrestore(&s390_kernel_write_lock, flags); } static int __memcpy_real(void *dest, void *src, size_t count) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 687f2a4d3459..cbc718ba6d78 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -24,8 +24,6 @@ static unsigned long stack_maxrandom_size(void) { if (!(current->flags & PF_RANDOMIZE)) return 0; - if (current->personality & ADDR_NO_RANDOMIZE) - return 0; return STACK_RND_MASK << PAGE_SHIFT; } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 5e7c63033159..e636728ab452 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -299,9 +299,11 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT_ZERO(b1) \ ({ \ - /* llgfr %dst,%dst (zero extend to 64 bit) */ \ - EMIT4(0xb9160000, b1, b1); \ - REG_SET_SEEN(b1); \ + if (!fp->aux->verifier_zext) { \ + /* llgfr %dst,%dst (zero extend to 64 bit) */ \ + EMIT4(0xb9160000, b1, b1); \ + REG_SET_SEEN(b1); \ + } \ }) /* @@ -520,6 +522,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */ /* llgfr %dst,%src */ EMIT4(0xb9160000, dst_reg, src_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ /* lgr %dst,%src */ @@ -528,6 +532,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */ /* llilf %dst,imm */ EMIT6_IMM(0xc00f0000, dst_reg, imm); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case BPF_ALU64 | BPF_MOV | BPF_K: /* dst = imm */ /* lgfi %dst,imm */ @@ -639,6 +645,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT4(0xb9970000, REG_W0, src_reg); /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; } case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */ @@ 
-676,6 +684,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT_CONST_U32(imm)); /* llgfr %dst,%rc */ EMIT4(0xb9160000, dst_reg, rc_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; } case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */ @@ -864,10 +874,13 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case 16: /* dst = (u16) cpu_to_be16(dst) */ /* llghr %dst,%dst */ EMIT4(0xb9850000, dst_reg, dst_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case 32: /* dst = (u32) cpu_to_be32(dst) */ - /* llgfr %dst,%dst */ - EMIT4(0xb9160000, dst_reg, dst_reg); + if (!fp->aux->verifier_zext) + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); break; case 64: /* dst = (u64) cpu_to_be64(dst) */ break; @@ -882,12 +895,15 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT4_DISP(0x88000000, dst_reg, REG_0, 16); /* llghr %dst,%dst */ EMIT4(0xb9850000, dst_reg, dst_reg); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case 32: /* dst = (u32) cpu_to_le32(dst) */ /* lrvr %dst,%dst */ EMIT4(0xb91f0000, dst_reg, dst_reg); - /* llgfr %dst,%dst */ - EMIT4(0xb9160000, dst_reg, dst_reg); + if (!fp->aux->verifier_zext) + /* llgfr %dst,%dst */ + EMIT4(0xb9160000, dst_reg, dst_reg); break; case 64: /* dst = (u64) cpu_to_le64(dst) */ /* lrvgr %dst,%dst */ @@ -968,16 +984,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* llgc %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ /* llgh %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ /* llgf %dst,off(%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); + if (insn_is_zext(&insn[1])) + insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ /* lg %dst,0(off,%src) */ @@ -1282,6 +1304,11 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) return 0; } +bool bpf_jit_needs_zext(void) +{ + return true; +} + /* * Compile eBPF program "fp" */ diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 86ca7f88fb22..b8a64cbb5dea 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -421,12 +421,12 @@ static void zpci_map_resources(struct pci_dev *pdev) if (!len) continue; - if (static_branch_likely(&have_mio)) + if (zpci_use_mio(zdev)) pdev->resource[i].start = (resource_size_t __force) zdev->bars[i].mio_wb; else - pdev->resource[i].start = - (resource_size_t __force) pci_iomap(pdev, i, 0); + pdev->resource[i].start = (resource_size_t __force) + pci_iomap_range_fh(pdev, i, 0, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } @@ -444,18 +444,19 @@ static void zpci_map_resources(struct pci_dev *pdev) static void zpci_unmap_resources(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); resource_size_t len; int i; - if (static_branch_likely(&have_mio)) + if (zpci_use_mio(zdev)) return; for (i = 0; i < PCI_BAR_COUNT; i++) { len = pci_resource_len(pdev, i); if (!len) continue; - pci_iounmap(pdev, (void __iomem __force *) - pdev->resource[i].start); + pci_iounmap_fh(pdev, (void __iomem __force *) + 
pdev->resource[i].start); } } @@ -528,7 +529,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, if (zdev->bars[i].val & 4) flags |= IORESOURCE_MEM_64; - if (static_branch_likely(&have_mio)) + if (zpci_use_mio(zdev)) addr = (unsigned long) zdev->bars[i].mio_wb; else addr = ZPCI_ADDR(entry); diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index d03631dba7c2..9bdff4defef1 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -291,7 +291,7 @@ int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) goto out; zdev->fh = fh; - if (zdev->mio_capable) { + if (zpci_use_mio(zdev)) { rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); if (rc) diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 6b48ca7760a7..3408c0df3ebf 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -74,7 +74,7 @@ static void pci_sw_counter_show(struct seq_file *m) int i; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) - seq_printf(m, "%26s:\t%lu\n", pci_sw_names[i], + seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], atomic64_read(counter)); } diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore index e9e66f178a6d..04a03433c720 100644 --- a/arch/s390/purgatory/.gitignore +++ b/arch/s390/purgatory/.gitignore @@ -1,2 +1,3 @@ -kexec-purgatory.c +purgatory +purgatory.lds purgatory.ro diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 2342b84b3386..b5e35e8f999a 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -6,7 +6,6 @@ kapi := arch/$(ARCH)/include/generated/asm kapi-hdrs-y := $(kapi)/facility-defs.h $(kapi)/dis-defs.h -targets += $(addprefix ../../../,$(kapi-hdrs-y)) PHONY += kapi kapi: $(kapi-hdrs-y) @@ -14,11 +13,7 @@ kapi: $(kapi-hdrs-y) hostprogs-y += gen_facilities hostprogs-y += gen_opcode_table -HOSTCFLAGS_gen_facilities.o += -Wall $(LINUXINCLUDE) -HOSTCFLAGS_gen_opcode_table.o += -Wall $(LINUXINCLUDE) - -# Ensure output directory exists -_dummy := $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') +HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE) filechk_facility-defs.h = $(obj)/gen_facilities diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 64638b764d1c..46d8ed96cf06 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -520,6 +520,9 @@ b92e km RRE_RR b92f kmc RRE_RR b930 cgfr RRE_RR b931 clgfr RRE_RR +b938 sortl RRE_RR +b939 dfltcc RRF_R0RR2 +b93a kdsa RRE_RR b93c ppno RRE_RR b93e kimd RRE_RR b93f klmd RRE_RR @@ -538,8 +541,16 @@ b95a cxlgtr RRF_UUFR b95b cxlftr RRF_UUFR b960 cgrt RRF_U0RR b961 clgrt RRF_U0RR +b964 nngrk RRF_R0RR2 +b965 ocgrk RRF_R0RR2 +b966 nogrk RRF_R0RR2 +b967 nxgrk RRF_R0RR2 b972 crt RRF_U0RR b973 clrt RRF_U0RR +b974 nnrk RRF_R0RR2 +b975 ocrk RRF_R0RR2 +b976 nork RRF_R0RR2 +b977 nxrk RRF_R0RR2 b980 ngr RRE_RR b981 ogr RRE_RR b982 xgr RRE_RR @@ -573,6 +584,7 @@ b99f ssair RRE_R0 b9a0 clp RRF_U0RR b9a1 tpei RRE_RR b9a2 ptf RRE_R0 +b9a4 uvc RRF_URR b9aa lptea RRF_RURR2 b9ab essa RRF_U0RR b9ac irbm RRE_RR @@ -585,6 +597,7 @@ b9b3 cu42 RRE_RR b9bd trtre RRF_U0RR b9be srstu RRE_RR b9bf trte RRF_U0RR +b9c0 selhhhr RRF_RURR b9c8 ahhhr RRF_R0RR2 b9c9 shhhr RRF_R0RR2 b9ca alhhhr RRF_R0RR2 @@ -594,6 +607,9 @@ b9cf clhhr RRE_RR b9d0 pcistg RRE_RR b9d2 pcilg RRE_RR b9d3 rpcit RRE_RR +b9d4 pcistgi RRE_RR +b9d5 pciwb RRE_00 +b9d6 pcilgi RRE_RR b9d8 ahhlr RRF_R0RR2 b9d9 shhlr RRF_R0RR2 b9da alhhlr RRF_R0RR2 @@ -601,9 +617,11 @@ b9db slhhlr 
RRF_R0RR2 b9dd chlr RRE_RR b9df clhlr RRE_RR b9e0 locfhr RRF_U0RR -b9e1 popcnt RRE_RR +b9e1 popcnt RRF_U0RR b9e2 locgr RRF_U0RR +b9e3 selgr RRF_RURR b9e4 ngrk RRF_R0RR2 +b9e5 ncgrk RRF_R0RR2 b9e6 ogrk RRF_R0RR2 b9e7 xgrk RRF_R0RR2 b9e8 agrk RRF_R0RR2 @@ -612,8 +630,10 @@ b9ea algrk RRF_R0RR2 b9eb slgrk RRF_R0RR2 b9ec mgrk RRF_R0RR2 b9ed msgrkc RRF_R0RR2 +b9f0 selr RRF_RURR b9f2 locr RRF_U0RR b9f4 nrk RRF_R0RR2 +b9f5 ncrk RRF_R0RR2 b9f6 ork RRF_R0RR2 b9f7 xrk RRF_R0RR2 b9f8 ark RRF_R0RR2 @@ -822,6 +842,7 @@ e3d4 stpcifc RXY_RRRD e500 lasp SSE_RDRD e501 tprot SSE_RDRD e502 strag SSE_RDRD +e50a mvcrl SSE_RDRD e50e mvcsk SSE_RDRD e50f mvcdk SSE_RDRD e544 mvhhi SIL_RDI @@ -835,6 +856,18 @@ e55c chsi SIL_RDI e55d clfhsi SIL_RDU e560 tbegin SIL_RDU e561 tbeginc SIL_RDU +e601 vlebrh VRX_VRRDU +e602 vlebrg VRX_VRRDU +e603 vlebrf VRX_VRRDU +e604 vllebrz VRX_VRRDU +e605 vlbrrep VRX_VRRDU +e606 vlbr VRX_VRRDU +e607 vler VRX_VRRDU +e609 vstebrh VRX_VRRDU +e60a vstebrg VRX_VRRDU +e60b vstebrf VRX_VRRDU +e60e vstbr VRX_VRRDU +e60f vster VRX_VRRDU e634 vpkz VSI_URDV e635 vlrl VSI_URDV e637 vlrlr VRS_RRDV @@ -842,8 +875,8 @@ e63c vupkz VSI_URDV e63d vstrl VSI_URDV e63f vstrlr VRS_RRDV e649 vlip VRI_V0UU2 -e650 vcvb VRR_RV0U -e652 vcvbg VRR_RV0U +e650 vcvb VRR_RV0UU +e652 vcvbg VRR_RV0UU e658 vcvd VRI_VR0UU e659 vsrp VRI_VVUUU2 e65a vcvdg VRI_VR0UU @@ -863,13 +896,13 @@ e702 vleg VRX_VRRDU e703 vlef VRX_VRRDU e704 vllez VRX_VRRDU e705 vlrep VRX_VRRDU -e706 vl VRX_VRRD +e706 vl VRX_VRRDU e707 vlbb VRX_VRRDU e708 vsteb VRX_VRRDU e709 vsteh VRX_VRRDU e70a vsteg VRX_VRRDU e70b vstef VRX_VRRDU -e70e vst VRX_VRRD +e70e vst VRX_VRRDU e712 vgeg VRV_VVXRDU e713 vgef VRV_VVXRDU e71a vsceg VRV_VVXRDU @@ -879,11 +912,11 @@ e722 vlvg VRS_VRRDU e727 lcbb RXE_RRRDU e730 vesl VRS_VVRDU e733 verll VRS_VVRDU -e736 vlm VRS_VVRD +e736 vlm VRS_VVRDU e737 vll VRS_VRRD e738 vesrl VRS_VVRDU e73a vesra VRS_VVRDU -e73e vstm VRS_VVRD +e73e vstm VRS_VVRDU e73f vstl VRS_VRRD e740 vleib VRI_V0IU e741 vleih VRI_V0IU @@ -932,7 +965,10 @@ e781 vfene VRR_VVV0U0U e782 vfae VRR_VVV0U0U e784 vpdi VRR_VVV0U e785 vbperm VRR_VVV +e786 vsld VRI_VVV0U +e787 vsrd VRI_VVV0U e78a vstrc VRR_VVVUU0V +e78b vstrs VRR_VVVUU0V e78c vperm VRR_VVV0V e78d vsel VRR_VVV0V e78e vfms VRR_VVVU0UV @@ -1060,6 +1096,7 @@ eb9b stamy RSY_AARD ebc0 tp RSL_R0RD ebd0 pcistb RSY_RRRD ebd1 sic RSY_RRRD +ebd4 pcistbi RSY_RRRD ebdc srak RSY_RRRD ebdd slak RSY_RRRD ebde srlk RSY_RRRD diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b77f512bb176..31a7d12db705 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 config SUPERH def_bool y + select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_MIGHT_HAVE_PC_PARPORT @@ -14,6 +15,7 @@ config SUPERH select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS select HAVE_DEBUG_BUGVERBOSE + select HAVE_FAST_GUP if MMU select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_GCOV_PROFILE_ALL @@ -63,6 +65,7 @@ config SUPERH config SUPERH32 def_bool "$(ARCH)" = "sh" select ARCH_32BIT_OFF_T + select GUP_GET_PTE_LOW_HIGH if X2TLB select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_IOREMAP_PROT if MMU && !X2TLB @@ -623,7 +626,7 @@ config CRASH_DUMP to a memory address not used by the main kernel using PHYSICAL_START. 
- For more details see Documentation/kdump/kdump.txt + For more details see Documentation/kdump/kdump.rst config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig index 4dcf7f552582..91d43e2bffea 100644 --- a/arch/sh/configs/hp6xx_defconfig +++ b/arch/sh/configs/hp6xx_defconfig @@ -40,7 +40,6 @@ CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_HIT=y CONFIG_FB_SH_MOBILE_LCDC=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FONTS=y CONFIG_FONT_PEARL_8x8=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 5209889765ad..49a29338789b 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -191,7 +191,6 @@ CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=m CONFIG_JFFS2_FS_XATTR=y CONFIG_UBIFS_FS=m -CONFIG_LOGFS=m CONFIG_CRAMFS=m CONFIG_SQUASHFS=m CONFIG_ROMFS_FS=m diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index 5a1097641247..1e116529735f 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -63,7 +63,6 @@ CONFIG_NET_SCH_NETEM=y CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_FW=y -CONFIG_NET_CLS_IND=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index 9c0ef13bee10..c66e512719ab 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -62,7 +62,6 @@ CONFIG_NET_SCH_NETEM=y CONFIG_NET_CLS_TCINDEX=y CONFIG_NET_CLS_ROUTE4=y CONFIG_NET_CLS_FW=y -CONFIG_NET_CLS_IND=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig index a1cf6447dbb1..cbd6742eb423 100644 --- a/arch/sh/configs/sh2007_defconfig +++ b/arch/sh/configs/sh2007_defconfig @@ -85,7 +85,6 @@ CONFIG_WATCHDOG=y CONFIG_SH_WDT=y CONFIG_SSB=y CONFIG_FB=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index 822fa9e96f74..171ab05ce4fc 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -142,7 +142,6 @@ CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_CLS_IND=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_FW_LOADER=m CONFIG_CONNECTOR=m diff --git a/arch/sh/include/asm/flat.h b/arch/sh/include/asm/flat.h index 843d458b8329..fee4f25555cb 100644 --- a/arch/sh/include/asm/flat.h +++ b/arch/sh/include/asm/flat.h @@ -11,11 +11,8 @@ #include <asm/unaligned.h> -#define flat_argvp_envp_on_stack() 0 -#define flat_old_ram_flag(flags) (flags) -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) + u32 *addr) { *addr = get_unaligned((__force u32 *)rp); return 0; @@ -25,8 +22,6 @@ static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel) put_unaligned(addr, (__force u32 *)rp); return 0; } -#define flat_get_relocate_addr(rel) (rel) -#define flat_set_persistent(relval, p) ({ (void)p; 0; }) #define FLAT_PLAT_INIT(_r) \ do { _r->regs[0]=0; _r->regs[1]=0; _r->regs[2]=0; _r->regs[3]=0; \ diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index c28e37a344ad..ac0561960c52 100644 --- 
a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -369,7 +369,11 @@ static inline int iounmap_fixed(void __iomem *addr) { return -EINVAL; } #define ioremap_nocache ioremap #define ioremap_uc ioremap -#define iounmap __iounmap + +static inline void iounmap(void __iomem *addr) +{ + __iounmap(addr); +} /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 7d8587eb65ff..779260b721ca 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -38,6 +38,9 @@ static inline unsigned long pud_page_vaddr(pud_t pud) return pud_val(pud); } +/* only used by the stubbed out hugetlb gup code, should never be called */ +#define pud_page(pud) NULL + #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 3587103afe59..9085d1142fa3 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -149,6 +149,43 @@ extern void paging_init(void); extern void page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd); +static inline bool __pte_access_permitted(pte_t pte, u64 prot) +{ + return (pte_val(pte) & (prot | _PAGE_SPECIAL)) == prot; +} + +#ifdef CONFIG_X2TLB +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot = _PAGE_PRESENT; + + prot |= _PAGE_EXT(_PAGE_EXT_KERN_READ | _PAGE_EXT_USER_READ); + if (write) + prot |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE); + return __pte_access_permitted(pte, prot); +} +#elif defined(CONFIG_SUPERH64) +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; + + if (write) + prot |= _PAGE_WRITE; + return __pte_access_permitted(pte, prot); +} +#else +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot = _PAGE_PRESENT | _PAGE_USER; + + if (write) + prot |= _PAGE_RW; + return __pte_access_permitted(pte, prot); +} +#endif + +#define pte_access_permitted pte_access_permitted + /* arch/sh/mm/mmap.c */ #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN diff --git a/arch/sh/kernel/cpu/sh2a/fpu.c b/arch/sh/kernel/cpu/sh2a/fpu.c index 74b48db86dd7..0bcff11a4843 100644 --- a/arch/sh/kernel/cpu/sh2a/fpu.c +++ b/arch/sh/kernel/cpu/sh2a/fpu.c @@ -568,5 +568,5 @@ BUILD_TRAP_HANDLER(fpu_error) return; } - force_sig(SIGFPE, tsk); + force_sig(SIGFPE); } diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c index 1ff56e5ba990..03ffd8cdf542 100644 --- a/arch/sh/kernel/cpu/sh4/fpu.c +++ b/arch/sh/kernel/cpu/sh4/fpu.c @@ -421,5 +421,5 @@ BUILD_TRAP_HANDLER(fpu_error) } } - force_sig(SIGFPE, tsk); + force_sig(SIGFPE); } diff --git a/arch/sh/kernel/cpu/sh5/fpu.c b/arch/sh/kernel/cpu/sh5/fpu.c index 9218d9ed787e..3966b5ee8e93 100644 --- a/arch/sh/kernel/cpu/sh5/fpu.c +++ b/arch/sh/kernel/cpu/sh5/fpu.c @@ -100,9 +100,7 @@ void restore_fpu(struct task_struct *tsk) asmlinkage void do_fpu_error(unsigned long ex, struct pt_regs *regs) { - struct task_struct *tsk = current; - regs->pc += 4; - force_sig(SIGFPE, tsk); + force_sig(SIGFPE); } diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c index bc96b16288c1..3bd010b4c55f 100644 --- a/arch/sh/kernel/hw_breakpoint.c +++ b/arch/sh/kernel/hw_breakpoint.c @@ -338,7 +338,7 @@ static int __kprobes 
hw_breakpoint_handler(struct die_args *args) /* Deliver the signal to userspace */ if (!arch_check_bp_in_kernelspace(&bp->hw.info)) { force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __user *)NULL, current); + (void __user *)NULL); } rcu_read_unlock(); diff --git a/arch/sh/kernel/kdebugfs.c b/arch/sh/kernel/kdebugfs.c index 95428e05d212..8b505e1556a5 100644 --- a/arch/sh/kernel/kdebugfs.c +++ b/arch/sh/kernel/kdebugfs.c @@ -9,9 +9,6 @@ EXPORT_SYMBOL(arch_debugfs_dir); static int __init arch_kdebugfs_init(void) { arch_debugfs_dir = debugfs_create_dir("sh", NULL); - if (!arch_debugfs_dir) - return -ENOMEM; - return 0; } arch_initcall(arch_kdebugfs_init); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index 3390349ff976..11085e48eaa6 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -550,7 +550,7 @@ asmlinkage void do_single_step(unsigned long long vec, struct pt_regs *regs) continually stepping. */ local_irq_enable(); regs->sr &= ~SR_SSTEP; - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } /* Called with interrupts disabled */ @@ -561,7 +561,7 @@ BUILD_TRAP_HANDLER(breakpoint) /* We need to forward step the PC, to counteract the backstep done in signal.c. */ local_irq_enable(); - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); regs->pc += 4; } diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 2a2121ba8ebe..24473fa6c3b6 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -176,7 +176,7 @@ asmlinkage int sys_sigreturn(void) return r0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -207,7 +207,7 @@ asmlinkage int sys_rt_sigreturn(void) return r0; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index f1f1598879c2..b9aaa9266b34 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -277,7 +277,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, return (int) ret; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -311,7 +311,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, return (int) ret; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 016a727d4357..834c9c7d79fa 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -436,3 +436,4 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 8b49cced663d..63cf17bc760d 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -141,7 +141,7 @@ BUILD_TRAP_HANDLER(debug) SIGTRAP) == NOTIFY_STOP) return; - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } /* @@ -167,7 +167,7 @@ BUILD_TRAP_HANDLER(bug) } #endif - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } BUILD_TRAP_HANDLER(nmi) diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index f2a18b5fafd8..058c6181bb30 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -533,7 +533,7 @@ uspace_segv: "access (PC %lx PR %lx)\n", current->comm, regs->pc, regs->pr); - force_sig_fault(SIGBUS, si_code, (void __user *)address, current); + force_sig_fault(SIGBUS, si_code, (void __user *)address); } else { inc_unaligned_kernel_access(); @@ -603,7 +603,7 
@@ asmlinkage void do_divide_error(unsigned long r4) /* Let gcc know unhandled cases don't make it past here */ return; } - force_sig_fault(SIGFPE, code, NULL, current); + force_sig_fault(SIGFPE, code, NULL); } #endif @@ -611,7 +611,6 @@ asmlinkage void do_reserved_inst(void) { struct pt_regs *regs = current_pt_regs(); unsigned long error_code; - struct task_struct *tsk = current; #ifdef CONFIG_SH_FPU_EMU unsigned short inst = 0; @@ -633,7 +632,7 @@ asmlinkage void do_reserved_inst(void) /* Enable DSP mode, and restart instruction. */ regs->sr |= SR_DSP; /* Save DSP mode */ - tsk->thread.dsp_status.status |= SR_DSP; + current->thread.dsp_status.status |= SR_DSP; return; } #endif @@ -641,7 +640,7 @@ asmlinkage void do_reserved_inst(void) error_code = lookup_exception_vector(); local_irq_enable(); - force_sig(SIGILL, tsk); + force_sig(SIGILL); die_if_no_fixup("reserved instruction", regs, error_code); } @@ -697,7 +696,6 @@ asmlinkage void do_illegal_slot_inst(void) { struct pt_regs *regs = current_pt_regs(); unsigned long inst; - struct task_struct *tsk = current; if (kprobe_handle_illslot(regs->pc) == 0) return; @@ -716,7 +714,7 @@ asmlinkage void do_illegal_slot_inst(void) inst = lookup_exception_vector(); local_irq_enable(); - force_sig(SIGILL, tsk); + force_sig(SIGILL); die_if_no_fixup("illegal slot instruction", regs, inst); } diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 8ce90a7da67d..37046f3a26d3 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -599,7 +599,7 @@ static void do_unhandled_exception(int signr, char *str, unsigned long error, struct pt_regs *regs) { if (user_mode(regs)) - force_sig(signr, current); + force_sig(signr); die_if_no_fixup(str, regs, error); } diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index a0fa8fc88739..e8be0eca0444 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -560,7 +560,7 @@ static int ieee_fpe_handler(struct pt_regs *regs) task_thread_info(tsk)->status |= TS_USEDFPU; } else { force_sig_fault(SIGFPE, FPE_FLTINV, - (void __user *)regs->pc, tsk); + (void __user *)regs->pc); } regs->pc = nextpc; diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile index fbe5e79751b3..5051b38fd5b6 100644 --- a/arch/sh/mm/Makefile +++ b/arch/sh/mm/Makefile @@ -17,7 +17,7 @@ cacheops-$(CONFIG_CPU_SHX3) += cache-shx3.o obj-y += $(cacheops-y) mmu-y := nommu.o extable_32.o -mmu-$(CONFIG_MMU) := extable_$(BITS).o fault.o gup.o ioremap.o kmap.o \ +mmu-$(CONFIG_MMU) := extable_$(BITS).o fault.o ioremap.o kmap.o \ pgtable.o tlbex_$(BITS).o tlbflush_$(BITS).o obj-y += $(mmu-y) diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index e5539e0f8e3b..4c1ca197e9c5 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -63,13 +63,8 @@ static const struct file_operations asids_debugfs_fops = { static int __init asids_debugfs_init(void) { - struct dentry *asids_dentry; - - asids_dentry = debugfs_create_file("asids", S_IRUSR, arch_debugfs_dir, - NULL, &asids_debugfs_fops); - if (!asids_dentry) - return -ENOMEM; - - return PTR_ERR_OR_ZERO(asids_dentry); + debugfs_create_file("asids", S_IRUSR, arch_debugfs_dir, NULL, + &asids_debugfs_fops); + return 0; } device_initcall(asids_debugfs_init); diff --git a/arch/sh/mm/cache-debugfs.c b/arch/sh/mm/cache-debugfs.c index 4eb9d43578b4..17d780794497 100644 --- a/arch/sh/mm/cache-debugfs.c +++ b/arch/sh/mm/cache-debugfs.c @@ -109,22 +109,10 @@ static const struct file_operations cache_debugfs_fops = { static int __init 
cache_debugfs_init(void) { - struct dentry *dcache_dentry, *icache_dentry; - - dcache_dentry = debugfs_create_file("dcache", S_IRUSR, arch_debugfs_dir, - (unsigned int *)CACHE_TYPE_DCACHE, - &cache_debugfs_fops); - if (!dcache_dentry) - return -ENOMEM; - - icache_dentry = debugfs_create_file("icache", S_IRUSR, arch_debugfs_dir, - (unsigned int *)CACHE_TYPE_ICACHE, - &cache_debugfs_fops); - if (!icache_dentry) { - debugfs_remove(dcache_dentry); - return -ENOMEM; - } - + debugfs_create_file("dcache", S_IRUSR, arch_debugfs_dir, + (void *)CACHE_TYPE_DCACHE, &cache_debugfs_fops); + debugfs_create_file("icache", S_IRUSR, arch_debugfs_dir, + (void *)CACHE_TYPE_ICACHE, &cache_debugfs_fops); return 0; } module_init(cache_debugfs_init); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 6defd2c6d9b1..3093bc372138 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -39,10 +39,9 @@ static inline int notify_page_fault(struct pt_regs *regs, int trap) } static void -force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) +force_sig_info_fault(int si_signo, int si_code, unsigned long address) { - force_sig_fault(si_signo, si_code, (void __user *)address, tsk); + force_sig_fault(si_signo, si_code, (void __user *)address); } /* @@ -244,8 +243,6 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, int si_code) { - struct task_struct *tsk = current; - /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { /* @@ -253,7 +250,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, */ local_irq_enable(); - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address); return; } @@ -308,7 +305,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) if (!user_mode(regs)) no_context(regs, error_code, address); - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); + force_sig_info_fault(SIGBUS, BUS_ADRERR, address); } static noinline int diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c deleted file mode 100644 index 277c882f7489..000000000000 --- a/arch/sh/mm/gup.c +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for SuperH - * - * Copyright (C) 2009 - 2010 Paul Mundt - * - * Cloned from the x86 and PowerPC versions, by: - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/highmem.h> -#include <asm/pgtable.h> - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X2TLB - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without - * taking any locks. For this we would like to load the pointers - * atomically, but that is not possible with 64-bit PTEs. What - * we do have is the guarantee that a pte will only either go - * from not present to present, or present to not present or both - * -- it will not switch to a completely different present page - * without a TLB flush in between; something that we are blocking - * by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. 
We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. - * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - u64 mask, result; - pte_t *ptep; - -#ifdef CONFIG_X2TLB - result = _PAGE_PRESENT | _PAGE_EXT(_PAGE_EXT_KERN_READ | _PAGE_EXT_USER_READ); - if (write) - result |= _PAGE_EXT(_PAGE_EXT_KERN_WRITE | _PAGE_EXT_USER_WRITE); -#elif defined(CONFIG_SUPERH64) - result = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; - if (write) - result |= _PAGE_WRITE; -#else - result = _PAGE_PRESENT | _PAGE_USER; - if (write) - result |= _PAGE_RW; -#endif - - mask = result | _PAGE_SPECIAL; - - ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - if ((pte_val(pte) & mask) != result) { - pte_unmap(ptep); - return 0; - } - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - __flush_anon_page(page, addr); - flush_dcache_page(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - pte_unmap(ptep - 1); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. 
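The ordering protocol spelled out in the deleted gup_get_pte() comment above is easiest to see in isolation. Below is a minimal userspace analog of that retry loop, using C11 fences where the kernel uses smp_wmb()/smp_rmb(); the two 32-bit halves stand in for pte_low/pte_high on X2TLB. This is a sketch of the technique, not kernel code.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Two 32-bit halves standing in for pte_low/pte_high. */
    static _Atomic uint32_t pte_low, pte_high;

    /* Writer, "not present -> present": publish high before low,
     * mirroring ptep->pte_high = h; smp_wmb(); ptep->pte_low = l; */
    static void set_pte(uint32_t l, uint32_t h)
    {
            atomic_store_explicit(&pte_high, h, memory_order_relaxed);
            atomic_thread_fence(memory_order_release);  /* smp_wmb() analog */
            atomic_store_explicit(&pte_low, l, memory_order_relaxed);
    }

    /* Reader: load low, fence, load high, fence, recheck low. */
    static uint64_t get_pte(void)
    {
            uint32_t l, h;

            do {
                    l = atomic_load_explicit(&pte_low, memory_order_relaxed);
                    atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
                    h = atomic_load_explicit(&pte_high, memory_order_relaxed);
                    atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
            } while (l != atomic_load_explicit(&pte_low, memory_order_relaxed));

            return ((uint64_t)h << 32) | l;
    }

    int main(void)
    {
            set_pte(0x1234u, 0xabcdu);
            printf("pte = %#llx\n", (unsigned long long)get_pte());
            return 0;
    }

The final recheck guarantees the low half did not change around the high-half load, which is all the deleted comment's present-bit argument needs.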
- */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, - pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, pages, - gup_flags); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index a53a040d0054..b59bad86b31e 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -861,13 +861,8 @@ static const struct file_operations pmb_debugfs_fops = { static int __init pmb_debugfs_init(void) { - struct dentry *dentry; - - dentry = debugfs_create_file("pmb", S_IFREG | S_IRUGO, - arch_debugfs_dir, NULL, &pmb_debugfs_fops); - if (!dentry) - return -ENOMEM; - + debugfs_create_file("pmb", S_IFREG | S_IRUGO, arch_debugfs_dir, NULL, + &pmb_debugfs_fops); return 0; } subsys_initcall(pmb_debugfs_init); diff --git a/arch/sh/mm/tlb-debugfs.c b/arch/sh/mm/tlb-debugfs.c index dea637a09246..11c6148283f3 100644 --- a/arch/sh/mm/tlb-debugfs.c +++ b/arch/sh/mm/tlb-debugfs.c @@ -149,22 +149,10 @@ static const struct file_operations tlb_debugfs_fops = { static int __init tlb_debugfs_init(void) { - struct dentry *itlb, *utlb; - - itlb = 
debugfs_create_file("itlb", S_IRUSR, arch_debugfs_dir, - (unsigned int *)TLB_TYPE_ITLB, - &tlb_debugfs_fops); - if (unlikely(!itlb)) - return -ENOMEM; - - utlb = debugfs_create_file("utlb", S_IRUSR, arch_debugfs_dir, - (unsigned int *)TLB_TYPE_UTLB, - &tlb_debugfs_fops); - if (unlikely(!utlb)) { - debugfs_remove(itlb); - return -ENOMEM; - } - + debugfs_create_file("itlb", S_IRUSR, arch_debugfs_dir, + (void *)TLB_TYPE_ITLB, &tlb_debugfs_fops); + debugfs_create_file("utlb", S_IRUSR, arch_debugfs_dir, + (void *)TLB_TYPE_UTLB, &tlb_debugfs_fops); return 0; } module_init(tlb_debugfs_init); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 26ab6f5bbaaf..e9f5d62e9817 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -28,6 +28,7 @@ config SPARC select RTC_DRV_M48T59 select RTC_SYSTOHC select HAVE_ARCH_JUMP_LABEL if SPARC64 + select HAVE_FAST_GUP if SPARC64 select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_PCI_IOMAP @@ -300,9 +301,6 @@ config NODES_SPAN_OTHER_NODES def_bool y depends on NEED_MULTIPLE_NODES -config ARCH_SELECT_MEMORY_MODEL - def_bool y if SPARC64 - config ARCH_SPARSEMEM_ENABLE def_bool y if SPARC64 select SPARSEMEM_VMEMMAP_ENABLE diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 6963482c81d8..b60448397d4f 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -23,15 +23,15 @@ #define ATOMIC_OP(op) \ void atomic_##op(int, atomic_t *); \ -void atomic64_##op(long, atomic64_t *); +void atomic64_##op(s64, atomic64_t *); #define ATOMIC_OP_RETURN(op) \ int atomic_##op##_return(int, atomic_t *); \ -long atomic64_##op##_return(long, atomic64_t *); +s64 atomic64_##op##_return(s64, atomic64_t *); #define ATOMIC_FETCH_OP(op) \ int atomic_fetch_##op(int, atomic_t *); \ -long atomic64_fetch_##op(long, atomic64_t *); +s64 atomic64_fetch_##op(s64, atomic64_t *); #define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op) ATOMIC_FETCH_OP(op) @@ -61,7 +61,7 @@ static inline int atomic_xchg(atomic_t *v, int new) ((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n))) #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -long atomic64_dec_if_positive(atomic64_t *v); +s64 atomic64_dec_if_positive(atomic64_t *v); #define atomic64_dec_if_positive atomic64_dec_if_positive #endif /* !(__ARCH_SPARC64_ATOMIC__) */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 22500c3be7a9..1599de730532 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -864,6 +864,9 @@ static inline unsigned long pud_page_vaddr(pud_t pud) #define pgd_present(pgd) (pgd_val(pgd) != 0U) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) +/* only used by the stubbed out hugetlb gup code, should never be called */ +#define pgd_page(pgd) NULL + static inline unsigned long pud_large(pud_t pud) { pte_t pte = __pte(pud_val(pud)); @@ -1075,6 +1078,46 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, } #define io_remap_pfn_range io_remap_pfn_range +static inline unsigned long untagged_addr(unsigned long start) +{ + if (adi_capable()) { + long addr = start; + + /* If userspace has passed a versioned address, kernel + * will not find it in the VMAs since it does not store + * the version tags in the list of VMAs. Storing version + * tags in list of VMAs is impractical since they can be + * changed any time from userspace without dropping into + * kernel. 
Any address search in VMAs will be done with + * non-versioned addresses. Ensure the ADI version bits + * are dropped here by sign extending the last bit before + * ADI bits. IOMMU does not implement version tags. + */ + return (addr << (long)adi_nbits()) >> (long)adi_nbits(); + } + + return start; +} +#define untagged_addr untagged_addr + +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + u64 prot; + + if (tlb_type == hypervisor) { + prot = _PAGE_PRESENT_4V | _PAGE_P_4V; + if (write) + prot |= _PAGE_WRITE_4V; + } else { + prot = _PAGE_PRESENT_4U | _PAGE_P_4U; + if (write) + prot |= _PAGE_WRITE_4U; + } + + return (pte_val(pte) & (prot | _PAGE_SPECIAL)) == prot; +} +#define pte_access_permitted pte_access_permitted + #include <asm/tlbflush.h> #include <asm-generic/pgtable.h> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 9265a9eece15..8029b681fc7c 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -115,6 +115,8 @@ #define SO_RCVTIMEO_NEW 0x0044 #define SO_SNDTIMEO_NEW 0x0045 +#define SO_DETACH_REUSEPORT_BPF 0x0047 + #if !defined(__KERNEL__) diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 59eaf6227af1..4282116e28e7 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -519,7 +519,7 @@ void synchronize_user_stack(void) static void stack_unaligned(unsigned long sp) { - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0); } static const char uwfault32[] = KERN_INFO \ @@ -570,7 +570,7 @@ void fault_in_user_windows(struct pt_regs *regs) barf: set_thread_wsaved(window + 1); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } asmlinkage long sparc_do_fork(unsigned long clone_flags, diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index e800ce13cc6e..a237810aa9f4 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -170,7 +170,7 @@ void do_sigreturn32(struct pt_regs *regs) return; segv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) @@ -256,7 +256,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) set_current_blocked(&set); return; segv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize) @@ -375,7 +375,7 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs, pr_info("%s[%d] bad frame in setup_frame32: %08lx TPC %08lx O7 %08lx\n", current->comm, current->pid, (unsigned long)sf, regs->tpc, regs->u_regs[UREG_I7]); - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return -EINVAL; } @@ -509,7 +509,7 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs, pr_info("%s[%d] bad frame in setup_rt_frame32: %08lx TPC %08lx O7 %08lx\n", current->comm, current->pid, (unsigned long)sf, regs->tpc, regs->u_regs[UREG_I7]); - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return -EINVAL; } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 83953780ca01..42c3de313fd6 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -137,7 +137,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) return; segv_and_exit: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } asmlinkage void do_rt_sigreturn(struct pt_regs *regs) 
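The shift pair in the new untagged_addr() above is plain sign extension: shifting the address left by adi_nbits() discards the version tag, and the arithmetic right shift replicates the highest non-tag bit back over the vacated field. A runnable sketch, assuming a hypothetical 4-bit tag purely for illustration (the real width comes from the ADI hardware via adi_nbits()):

    #include <stdint.h>
    #include <stdio.h>

    #define TAG_BITS 4   /* hypothetical; the kernel asks adi_nbits() */

    static uint64_t untag(uint64_t tagged)
    {
            /* Shift left as unsigned to drop the tag, then shift right
             * as signed so the last pre-tag bit is extended over the
             * tag field -- the same expression untagged_addr() uses. */
            int64_t addr = (int64_t)(tagged << TAG_BITS);

            return (uint64_t)(addr >> TAG_BITS);
    }

    int main(void)
    {
            uint64_t p = 0xa0007f1234560000ull;  /* version tag 0xa on top */

            printf("%#llx -> %#llx\n", (unsigned long long)p,
                   (unsigned long long)untag(p));
            return 0;
    }

Sign extension rather than masking matters for addresses whose bit just below the tag field is set: those keep ones in the top bits, so they survive the transformation unchanged.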
@@ -196,7 +196,7 @@ asmlinkage void do_rt_sigreturn(struct pt_regs *regs) set_current_blocked(&set); return; segv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize) diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index ca70787efd8e..69ae814b7e90 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -134,7 +134,7 @@ out: exception_exit(prev_state); return; do_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); goto out; } @@ -228,7 +228,7 @@ out: exception_exit(prev_state); return; do_sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); goto out; } @@ -320,7 +320,7 @@ void do_rt_sigreturn(struct pt_regs *regs) set_current_blocked(&set); return; segv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize) @@ -374,7 +374,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) pr_info("%s[%d] bad frame in setup_rt_frame: %016lx TPC %016lx O7 %016lx\n", current->comm, current->pid, (unsigned long)sf, regs->tpc, regs->u_regs[UREG_I7]); - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); return -EINVAL; } diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 452e4d080855..be77538bc038 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -151,7 +151,7 @@ sparc_breakpoint (struct pt_regs *regs) #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Entering kernel PC=%x, nPC=%x\n", regs->pc, regs->npc); #endif - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0, current); + force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%x nPC=%x\n", regs->pc, regs->npc); diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 9825ca6a6020..ccc88926bc00 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -511,7 +511,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs) #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc); #endif - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0, current); + force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc); #endif diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index e047480b1605..c58e71f21129 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -479,3 +479,4 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index bcdfc6168dd5..4ceecad556a9 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -103,7 +103,7 @@ void do_hw_interrupt(struct pt_regs *regs, unsigned long type) die_if_kernel("Kernel bad trap", regs); force_sig_fault(SIGILL, ILL_ILLTRP, - (void __user *)regs->pc, type - 0x80, current); + (void __user *)regs->pc, type - 0x80); } void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc, @@ -327,7 +327,7 @@ void handle_reg_access(struct pt_regs 
*regs, unsigned long pc, unsigned long npc printk("Register Access Exception at PC %08lx NPC %08lx PSR %08lx\n", pc, npc, psr); #endif - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0, current); + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0); } void handle_cp_disabled(struct pt_regs *regs, unsigned long pc, unsigned long npc, diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 04aa588d5dd1..27778b65a965 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -108,7 +108,7 @@ void bad_trap(struct pt_regs *regs, long lvl) regs->tnpc &= 0xffffffff; } force_sig_fault(SIGILL, ILL_ILLTRP, - (void __user *)regs->tpc, lvl, current); + (void __user *)regs->tpc, lvl); } void bad_trap_tl1(struct pt_regs *regs, long lvl) @@ -202,7 +202,7 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un regs->tnpc &= 0xffffffff; } force_sig_fault(SIGSEGV, SEGV_MAPERR, - (void __user *)regs->tpc, 0, current); + (void __user *)regs->tpc, 0); out: exception_exit(prev_state); } @@ -237,7 +237,7 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0, current); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0); } void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx) @@ -322,7 +322,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un if (is_no_fault_exception(regs)) return; - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0, current); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0); out: exception_exit(prev_state); } @@ -386,16 +386,13 @@ void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsig */ switch (type) { case HV_FAULT_TYPE_INV_ASI: - force_sig_fault(SIGILL, ILL_ILLADR, (void __user *)addr, 0, - current); + force_sig_fault(SIGILL, ILL_ILLADR, (void __user *)addr, 0); break; case HV_FAULT_TYPE_MCD_DIS: - force_sig_fault(SIGSEGV, SEGV_ACCADI, (void __user *)addr, 0, - current); + force_sig_fault(SIGSEGV, SEGV_ACCADI, (void __user *)addr, 0); break; default: - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)addr, 0, - current); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)addr, 0); break; } } @@ -572,7 +569,7 @@ static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned lon regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0, current); + force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0); } void spitfire_access_error(struct pt_regs *regs, unsigned long status_encoded, unsigned long afar) @@ -2074,7 +2071,7 @@ void do_mcd_err(struct pt_regs *regs, struct sun4v_error_entry ent) * code */ force_sig_fault(SIGSEGV, SEGV_ADIDERR, (void __user *)ent.err_raddr, - 0, current); + 0); } /* We run with %pil set to PIL_NORMAL_MAX and PSTATE_IE enabled in %pstate. 
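Nearly every hunk in this part of the series is one mechanical conversion: force_sig(), force_sigsegv() and force_sig_fault() can only ever act on the calling task, so the explicit task argument (always current) has been dropped from their prototypes. A hedged userspace mock of the new call shape — the helper bodies below are stand-ins, not the kernel/signal.c implementation, and the sparc variants additionally carry a trap-number argument (the extra 0 in the hunks above) that this sketch omits:

    #define _GNU_SOURCE   /* for the SEGV_* si_code constants */
    #include <signal.h>
    #include <stdio.h>

    /* Stand-ins for the reworked helpers; the real ones queue an
     * unblockable signal for current, which is now implicit. */
    static void force_sig(int sig)
    {
            printf("force_sig(%d) for current\n", sig);
    }

    static void force_sig_fault(int sig, int code, void *addr)
    {
            printf("force_sig_fault(%d, %d, %p) for current\n",
                   sig, code, addr);
    }

    /* A fault handler after the conversion: no task_struct argument
     * and no "tsk = current" local left to thread through. */
    static void do_bad_access(void *addr)
    {
            /* was: force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, current); */
            force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
    }

    int main(void)
    {
            do_bad_access((void *)0xdead0000UL);
            force_sig(SIGILL);
            return 0;
    }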
@@ -2182,13 +2179,13 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs, addr += PAGE_SIZE; } } - force_sig(SIGKILL, current); + force_sig(SIGKILL); return true; } if (attrs & SUN4V_ERR_ATTRS_PIO) { force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)sun4v_get_vaddr(regs), 0, current); + (void __user *)sun4v_get_vaddr(regs), 0); return true; } @@ -2345,7 +2342,7 @@ static void do_fpe_common(struct pt_regs *regs) code = FPE_FLTRES; } force_sig_fault(SIGFPE, code, - (void __user *)regs->tpc, 0, current); + (void __user *)regs->tpc, 0); } } @@ -2400,7 +2397,7 @@ void do_tof(struct pt_regs *regs) regs->tnpc &= 0xffffffff; } force_sig_fault(SIGEMT, EMT_TAGOVF, - (void __user *)regs->tpc, 0, current); + (void __user *)regs->tpc, 0); out: exception_exit(prev_state); } @@ -2420,7 +2417,7 @@ void do_div0(struct pt_regs *regs) regs->tnpc &= 0xffffffff; } force_sig_fault(SIGFPE, FPE_INTDIV, - (void __user *)regs->tpc, 0, current); + (void __user *)regs->tpc, 0); out: exception_exit(prev_state); } @@ -2616,7 +2613,7 @@ void do_illegal_instruction(struct pt_regs *regs) } } } - force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current); + force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0); out: exception_exit(prev_state); } @@ -2636,7 +2633,7 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo if (is_no_fault_exception(regs)) return; - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0); out: exception_exit(prev_state); } @@ -2654,7 +2651,7 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c if (is_no_fault_exception(regs)) return; - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0); } /* sun4v_mem_corrupt_detect_precise() - Handle precise exception on an ADI @@ -2701,7 +2698,7 @@ void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr, regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGSEGV, SEGV_ADIPERR, (void __user *)addr, 0, current); + force_sig_fault(SIGSEGV, SEGV_ADIPERR, (void __user *)addr, 0); } void do_privop(struct pt_regs *regs) @@ -2717,7 +2714,7 @@ void do_privop(struct pt_regs *regs) regs->tnpc &= 0xffffffff; } force_sig_fault(SIGILL, ILL_PRVOPC, - (void __user *)regs->tpc, 0, current); + (void __user *)regs->tpc, 0); out: exception_exit(prev_state); } diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile index d39075b1e3b7..b078205b70e0 100644 --- a/arch/sparc/mm/Makefile +++ b/arch/sparc/mm/Makefile @@ -5,7 +5,7 @@ asflags-y := -ansi ccflags-y := -Werror -obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o gup.o +obj-$(CONFIG_SPARC64) += ultra.o tlb.o tsb.o obj-y += fault_$(BITS).o obj-y += init_$(BITS).o obj-$(CONFIG_SPARC32) += extable.o srmmu.o iommu.o io-unit.o diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index b0440b0edd97..8d69de111470 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -131,7 +131,7 @@ static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs, show_signal_msg(regs, sig, code, addr, current); - force_sig_fault(sig, code, (void __user *) addr, 0, current); + force_sig_fault(sig, code, (void __user *) addr, 0); } static unsigned long compute_si_addr(struct pt_regs *regs, int text_fault) @@ -425,7 +425,7 @@ do_sigbus: static void check_stack_aligned(unsigned long sp) { if (sp & 0x7UL) - force_sig(SIGILL, current); + 
force_sig(SIGILL); } void window_overflow_fault(void) diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 8f8a604c1300..83fda4d9c3b2 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -187,7 +187,7 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs, if (unlikely(show_unhandled_signals)) show_signal_msg(regs, sig, code, addr, current); - force_sig_fault(sig, code, (void __user *) addr, 0, current); + force_sig_fault(sig, code, (void __user *) addr, 0); } static unsigned int get_fault_insn(struct pt_regs *regs, unsigned int insn) diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c deleted file mode 100644 index 1e770a517d4a..000000000000 --- a/arch/sparc/mm/gup.c +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for sparc, cribbed from powerpc - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/pagemap.h> -#include <linux/rwsem.h> -#include <asm/pgtable.h> -#include <asm/adi.h> - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask, result; - pte_t *ptep; - - if (tlb_type == hypervisor) { - result = _PAGE_PRESENT_4V|_PAGE_P_4V; - if (write) - result |= _PAGE_WRITE_4V; - } else { - result = _PAGE_PRESENT_4U|_PAGE_P_4U; - if (write) - result |= _PAGE_WRITE_4U; - } - mask = result | _PAGE_SPECIAL; - - ptep = pte_offset_kernel(&pmd, addr); - do { - struct page *page, *head; - pte_t pte = *ptep; - - if ((pte_val(pte) & mask) != result) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - /* The hugepage case is simplified on sparc64 because - * we encode the sub-page pfn offsets into the - * hugepage PTEs. We could optimize this in the future - * use page_cache_add_speculative() for the hugepage case. 
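The mask/result pair set up in the deleted gup_pte_range() above folds two conditions into one compare: every bit in result must be set, and _PAGE_SPECIAL — present in mask but absent from result — must be clear, so special mappings fail the same test. The new pte_access_permitted() in pgtable_64.h earlier in this section uses the identical trick. A small sketch with made-up bit values:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative bits only; the real _PAGE_* layout is per-MMU
     * (see the 4U/4V variants above). */
    #define P_PRESENT  (1u << 0)
    #define P_USER     (1u << 1)
    #define P_WRITE    (1u << 2)
    #define P_SPECIAL  (1u << 7)

    /* (pte & mask) == result  <=>  all result bits set AND all
     * (mask & ~result) bits clear -- one branch covers both. */
    static bool pte_ok(uint32_t pte, bool write)
    {
            uint32_t result = P_PRESENT | P_USER;
            uint32_t mask;

            if (write)
                    result |= P_WRITE;
            mask = result | P_SPECIAL;

            return (pte & mask) == result;
    }

    int main(void)
    {
            printf("%d\n", pte_ok(P_PRESENT | P_USER, false));             /* 1 */
            printf("%d\n", pte_ok(P_PRESENT | P_USER, true));              /* 0: no write */
            printf("%d\n", pte_ok(P_PRESENT | P_USER | P_SPECIAL, false)); /* 0: special */
            return 0;
    }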
- */ - page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(head); - return 0; - } - - pages[*nr] = page; - (*nr)++; - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, - int *nr) -{ - struct page *head, *page; - int refs; - - if (!(pmd_val(pmd) & _PAGE_VALID)) - return 0; - - if (write && !pmd_write(pmd)) - return 0; - - refs = 0; - page = pmd_page(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - head = compound_head(page); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, - int *nr) -{ - struct page *head, *page; - int refs; - - if (!(pud_val(pud) & _PAGE_VALID)) - return 0; - - if (write && !pud_write(pud)) - return 0; - - refs = 0; - page = pud_page(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - head = compound_head(page); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pud_val(pud) != pud_val(*pudp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmd, addr, next, write, - pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pudp, pud, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp; - int nr = 0; - -#ifdef CONFIG_SPARC64 - if (adi_capable()) { - long addr = start; - - /* If userspace has passed a versioned address, kernel - * will not find it in the VMAs since it does not store - * the version tags in the list of VMAs. 
Storing version - * tags in list of VMAs is impractical since they can be - * changed any time from userspace without dropping into - * kernel. Any address search in VMAs will be done with - * non-versioned addresses. Ensure the ADI version bits - * are dropped here by sign extending the last bit before - * ADI bits. IOMMU does not implement version tags. - */ - addr = (addr << (long)adi_nbits()) >> (long)adi_nbits(); - start = addr; - } -#endif - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - -#ifdef CONFIG_SPARC64 - if (adi_capable()) { - long addr = start; - - /* If userspace has passed a versioned address, kernel - * will not find it in the VMAs since it does not store - * the version tags in the list of VMAs. Storing version - * tags in list of VMAs is impractical since they can be - * changed any time from userspace without dropping into - * kernel. Any address search in VMAs will be done with - * non-versioned addresses. Ensure the ADI version bits - * are dropped here by sign extending the last bit before - * ADI bits. IOMMU does not implements version tags, - */ - addr = (addr << (long)adi_nbits()) >> (long)adi_nbits(); - start = addr; - } -#endif - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables from being freed on sparc. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. 
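All of the walkers above pin a page the same lockless way: take a speculative reference on the head page, re-read the page-table entry, and drop the reference if it changed underneath. The sketch below mimics that grab/revalidate/back-off sequence with C11 atomics; the names are stand-ins, not the page_cache_get_speculative() implementation:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct page {
            atomic_int refcount;
    };

    /* Speculative get: only succeeds if the page is already live. */
    static bool page_get_speculative(struct page *p, int refs)
    {
            int old = atomic_load(&p->refcount);

            do {
                    if (old == 0)
                            return false;  /* being freed, back off */
            } while (!atomic_compare_exchange_weak(&p->refcount, &old,
                                                   old + refs));
            return true;
    }

    static void page_put(struct page *p, int refs)
    {
            atomic_fetch_sub(&p->refcount, refs);
    }

    /* The GUP pattern: pin, then revalidate the entry we walked. */
    static bool pin_page(struct page *p, _Atomic uint64_t *ptep, uint64_t seen)
    {
            if (!page_get_speculative(p, 1))
                    return false;
            if (atomic_load(ptep) != seen) {  /* changed under us */
                    page_put(p, 1);
                    return false;             /* fall back to slow path */
            }
            return true;
    }

    int main(void)
    {
            struct page p = { .refcount = 1 };
            _Atomic uint64_t pte = 0x1000;

            printf("pinned: %d\n", pin_page(&p, &pte, 0x1000)); /* 1 */
            printf("pinned: %d\n", pin_page(&p, &pte, 0x2000)); /* 0: stale walk */
            return 0;
    }

On failure the fast path simply reports fewer pinned pages and the slow:/slow_irqon: fallback above retries under the lock — the same structure the generic HAVE_FAST_GUP code keeps.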
- */ - local_irq_disable(); - - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, - pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); - - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, pages, - gup_flags); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 65428e79b2f3..3364e2a00989 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -908,6 +908,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: emit_alu3_K(SRL, src, 0, dst, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_ALU64 | BPF_MOV | BPF_X: emit_reg_move(src, dst, ctx); @@ -942,6 +944,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_ALU | BPF_DIV | BPF_X: emit_write_y(G0, ctx); emit_alu(DIV, src, dst, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_ALU64 | BPF_DIV | BPF_X: emit_alu(UDIVX, src, dst, ctx); @@ -975,6 +979,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; case BPF_ALU | BPF_RSH | BPF_X: emit_alu(SRL, src, dst, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_ALU64 | BPF_RSH | BPF_X: emit_alu(SRLX, src, dst, ctx); @@ -997,9 +1003,12 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case 16: emit_alu_K(SLL, dst, 16, ctx); emit_alu_K(SRL, dst, 16, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case 32: - emit_alu_K(SRL, dst, 0, ctx); + if (!ctx->prog->aux->verifier_zext) + emit_alu_K(SRL, dst, 0, ctx); break; case 64: /* nop */ @@ -1021,6 +1030,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_alu3_K(AND, dst, 0xff, dst, ctx); emit_alu3_K(SLL, tmp, 8, tmp, ctx); emit_alu(OR, tmp, dst, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case 32: @@ -1037,6 +1048,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_alu3_K(AND, dst, 0xff, dst, ctx); /* dst = dst & 0xff */ emit_alu3_K(SLL, dst, 24, dst, ctx); /* dst = dst << 24 */ emit_alu(OR, tmp, dst, ctx); /* dst = dst | tmp */ + if (insn_is_zext(&insn[1])) + return 1; break; case 64: @@ -1050,6 +1063,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* dst = imm */ case BPF_ALU | BPF_MOV | BPF_K: emit_loadimm32(imm, dst, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_ALU64 | BPF_MOV | BPF_K: emit_loadimm_sext(imm, dst, ctx); @@ -1132,6 +1147,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; case BPF_ALU | BPF_RSH | BPF_K: emit_alu_K(SRL, dst, imm, ctx); + if (insn_is_zext(&insn[1])) + return 1; break; case BPF_ALU64 | BPF_RSH | BPF_K: emit_alu_K(SRLX, dst, imm, ctx); @@ -1144,7 +1161,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; do_alu32_trunc: - if (BPF_CLASS(code) == BPF_ALU) + if (BPF_CLASS(code) == BPF_ALU && + !ctx->prog->aux->verifier_zext) 
emit_alu_K(SRL, dst, 0, ctx); break; @@ -1265,6 +1283,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) rs2 = RS2(tmp); } emit(opcode | RS1(src) | rs2 | RD(dst), ctx); + if (opcode != LD64 && insn_is_zext(&insn[1])) + return 1; break; } /* ST: *(size *)(dst + off) = imm */ @@ -1432,6 +1452,11 @@ static void jit_fill_hole(void *area, unsigned int size) *ptr++ = 0x91d02005; /* ta 5 */ } +bool bpf_jit_needs_zext(void) +{ + return true; +} + struct sparc64_jit_data { struct bpf_binary_header *header; u8 *image; diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 99eb5682792a..d7b282e9c4d5 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -10,6 +10,8 @@ #include <linux/mm.h> +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) __pa(pte))) @@ -25,20 +27,6 @@ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *); -extern pgtable_t pte_alloc_one(struct mm_struct *); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long) pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - #define __pte_free_tlb(tlb,pte, address) \ do { \ pgtable_page_dtor(pte); \ diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index a43d42bf0a86..783b9247161f 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -32,7 +32,7 @@ void flush_thread(void) if (ret) { printk(KERN_ERR "flush_thread - clearing address space failed, " "err = %d\n", ret); - force_sig(SIGKILL, current); + force_sig(SIGKILL); } get_safe_registers(current_pt_regs()->regs.gp, current_pt_regs()->regs.fp); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index a9c9a94c096f..de58e976b9bc 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -208,28 +208,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long) pgd); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); - return pte; -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!pte) - return NULL; - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; -} - #ifdef CONFIG_3_LEVEL_PGTABLES pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 5f47422401e1..da1e96b1ec3e 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -112,13 +112,12 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs, - int error_code) +static void send_sigtrap(struct uml_pt_regs *regs, int error_code) { /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, TRAP_BRKPT, /* User-mode eip? */ - UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL, tsk); + UPT_IS_USER(regs) ? 
(void __user *) UPT_IP(regs) : NULL); } /* @@ -147,7 +146,7 @@ void syscall_trace_leave(struct pt_regs *regs) /* Fake a debug trap */ if (ptraced & PT_DTRACE) - send_sigtrap(current, ®s->regs, 0); + send_sigtrap(®s->regs, 0); if (!test_thread_flag(TIF_SYSCALL_TRACE)) return; diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 7a1f2a936fd1..29e7f5f9f188 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -119,7 +119,7 @@ void uml_setup_stubs(struct mm_struct *mm) return; out: - force_sigsegv(SIGSEGV, current); + force_sigsegv(SIGSEGV); } void arch_exit_mmap(struct mm_struct *mm) diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 8347161c2ae0..45f739bf302f 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -329,7 +329,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, "process: %d\n", task_tgid_vnr(current)); /* We are under mmap_sem, release it such that current can terminate */ up_write(¤t->mm->mmap_sem); - force_sig(SIGKILL, current); + force_sig(SIGKILL); do_signal(¤t->thread.regs); } } @@ -487,7 +487,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) kill: printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); - force_sig(SIGKILL, current); + force_sig(SIGKILL); } pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address) diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 0e8b6158f224..58fe36856182 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -163,13 +163,12 @@ static void show_segv_info(struct uml_pt_regs *regs) static void bad_segv(struct faultinfo fi, unsigned long ip) { current->thread.arch.faultinfo = fi; - force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi), - current); + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi)); } void fatal_sigsegv(void) { - force_sigsegv(SIGSEGV, current); + force_sigsegv(SIGSEGV); do_signal(¤t->thread.regs); /* * This is to tell gcc that we're not returning - do_signal @@ -268,13 +267,11 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, if (err == -EACCES) { current->thread.arch.faultinfo = fi; - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, - current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } else { BUG_ON(err != -EFAULT); current->thread.arch.faultinfo = fi; - force_sig_fault(SIGSEGV, si_code, (void __user *) address, - current); + force_sig_fault(SIGSEGV, si_code, (void __user *) address); } out: @@ -304,12 +301,11 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) { struct faultinfo *fi = UPT_FAULTINFO(regs); current->thread.arch.faultinfo = *fi; - force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi), - current); + force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi)); } else { printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n", sig, code, err); - force_sig(sig, current); + force_sig(sig); } } diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index ec64834b1c6a..3f0903bd98e9 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -14,6 +14,10 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include <asm-generic/pgalloc.h> + #define check_pgt_cache() do { } while (0) #define 
_PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) @@ -25,17 +29,14 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) #define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) - /* * Allocate one PTE table. */ static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; + pte_t *pte = __pte_alloc_one_kernel(mm); - pte = (pte_t *)__get_free_page(PGALLOC_GFP); if (pte) clean_dcache_area(pte, PTRS_PER_PTE * sizeof(pte_t)); @@ -47,35 +48,14 @@ pte_alloc_one(struct mm_struct *mm) { struct page *pte; - pte = alloc_pages(PGALLOC_GFP, 0); + pte = __pte_alloc_one(mm, GFP_PGTABLE_USER); if (!pte) return NULL; - if (!PageHighMem(pte)) { - void *page = page_address(pte); - clean_dcache_area(page, PTRS_PER_PTE * sizeof(pte_t)); - } - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - } - + if (!PageHighMem(pte)) + clean_pte_table(page_address(pte)); return pte; } -/* - * Free one PTE table. - */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - if (pte) - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - static inline void __pmd_populate(pmd_t *pmdp, unsigned long pmdval) { set_pmd(pmdp, __pmd(pmdval)); diff --git a/arch/unicore32/include/mach/regs-gpio.h b/arch/unicore32/include/mach/regs-gpio.h index 806350e1ccb6..5fc701ee33e3 100644 --- a/arch/unicore32/include/mach/regs-gpio.h +++ b/arch/unicore32/include/mach/regs-gpio.h @@ -32,7 +32,7 @@ */ #define GPIO_GEDR (PKUNITY_GPIO_BASE + 0x0018) /* - * Sepcial Voltage Detect Reg GPIO_GPIR. + * Special Voltage Detect Reg GPIO_GPIR. */ #define GPIO_GPIR (PKUNITY_GPIO_BASE + 0x0020) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index e62f82bd1339..3946182a835d 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -126,7 +126,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) return regs->UCreg_00; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -383,7 +383,7 @@ static void do_signal(struct pt_regs *regs, int syscall) regs->UCreg_pc = KERN_RESTART_CODE; } else { regs->UCreg_sp += 4; - force_sigsegv(0, current); + force_sigsegv(0); } } if (regs->UCreg_00 == -ERESTARTNOHAND || diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 1c1f0ce20e19..e24f67283864 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -245,7 +245,7 @@ void uc32_notify_die(const char *str, struct pt_regs *regs, current->thread.error_code = err; current->thread.trap_no = trap; - force_sig_fault(sig, code, addr, current); + force_sig_fault(sig, code, addr); } else die(str, regs, err); } diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 33e0d8a267e8..76342de9cf8c 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -113,14 +113,15 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, * Something tried to access memory that isn't in our memory map.. 
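The unicore32 hunks above repeat the consolidation already applied to alpha and um: the open-coded allocate / pgtable_page_ctor() / free-on-failure sequence moves behind asm-generic/pgalloc.h, and an architecture that needs extra work only decorates the generic helper (here, cleaning the D-cache). Notably, the removed unicore32 pte_alloc_one() freed the page on constructor failure but still returned it; routing through the generic helper fixes that. A rough userspace model of the split — every name below is a stand-in:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-ins: a "page" plus the constructor that, in the kernel,
     * sets up the split page-table lock and accounting. */
    struct page { void *mem; };

    static bool pgtable_page_ctor(struct page *pg)
    {
            return pg->mem != NULL;   /* can fail in the kernel, too */
    }

    /* The generic helper: allocate, run the ctor, and on failure
     * free the page and report NULL -- never return a freed page. */
    static struct page *pte_alloc_one_generic(void)
    {
            struct page *pg = calloc(1, sizeof(*pg));

            if (!pg)
                    return NULL;
            pg->mem = calloc(1, 4096);
            if (!pgtable_page_ctor(pg)) {
                    free(pg->mem);
                    free(pg);
                    return NULL;
            }
            return pg;
    }

    /* An arch override decorates the generic helper, the way the
     * new unicore32 pte_alloc_one() cleans the D-cache afterwards. */
    static struct page *pte_alloc_one_arch(void)
    {
            struct page *pg = pte_alloc_one_generic();

            if (pg)
                    printf("clean_dcache_area(%p)\n", pg->mem);
            return pg;
    }

    int main(void)
    {
            return pte_alloc_one_arch() ? 0 : 1;
    }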
* User mode accesses just cause a SIGSEGV */ -static void __do_user_fault(struct task_struct *tsk, unsigned long addr, - unsigned int fsr, unsigned int sig, int code, - struct pt_regs *regs) +static void __do_user_fault(unsigned long addr, unsigned int fsr, + unsigned int sig, int code, struct pt_regs *regs) { + struct task_struct *tsk = current; + tsk->thread.address = addr; tsk->thread.error_code = fsr; tsk->thread.trap_no = 14; - force_sig_fault(sig, code, (void __user *)addr, tsk); + force_sig_fault(sig, code, (void __user *)addr); } void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) @@ -133,7 +134,7 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * have no context to handle this fault with. */ if (user_mode(regs)) - __do_user_fault(tsk, addr, fsr, SIGSEGV, SEGV_MAPERR, regs); + __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); else __do_kernel_fault(mm, addr, fsr, regs); } @@ -307,7 +308,7 @@ retry: code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR; } - __do_user_fault(tsk, addr, fsr, sig, code, regs); + __do_user_fault(addr, fsr, sig, code, regs); return 0; no_context: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..9df2d1cb7a9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_32 select HAVE_DEBUG_STACKOVERFLOW select MODULES_USE_ELF_REL select OLD_SIGACTION + select GENERIC_VDSO_32 config X86_64 def_bool y @@ -121,6 +122,8 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select GENERIC_GETTIMEOFDAY + select GUP_GET_PTE_LOW_HIGH if X86_PAE select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI @@ -156,6 +159,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA select HAVE_EXIT_THREAD + select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER @@ -202,6 +206,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER + select HAVE_GENERIC_VDSO select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_SG_DMA_LENGTH @@ -217,6 +222,7 @@ config X86 select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_FEATURE_NAMES if PROC_FS + select PROC_PID_ARCH_STATUS if PROC_FS config INSTRUCTION_DECODER def_bool y @@ -395,7 +401,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also <file:Documentation/x86/i386/IO-APIC.txt>, + See also <file:Documentation/x86/i386/IO-APIC.rst>, <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at <http://www.tldp.org/docs.html#howto>. @@ -781,6 +787,9 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. +config X86_HV_CALLBACK_VECTOR + def_bool n + source "arch/x86/xen/Kconfig" config KVM_GUEST @@ -832,6 +841,17 @@ config JAILHOUSE_GUEST cell. You can leave this option disabled if you only want to start Jailhouse and run Linux afterwards in the root cell. +config ACRN_GUEST + bool "ACRN Guest support" + depends on X86_64 + select X86_HV_CALLBACK_VECTOR + help + This option allows to run Linux as guest in the ACRN hypervisor. ACRN is + a flexible, lightweight reference open-source hypervisor, built with + real-time and safety-criticality in mind. It is built for embedded + IOT with small footprint and real-time features. 
More details can be + found in https://projectacrn.org/. + endif #HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" @@ -1290,7 +1310,7 @@ config MICROCODE the Linux kernel. The preferred method to load microcode from a detached initrd is described - in Documentation/x86/microcode.txt. For that you need to enable + in Documentation/x86/microcode.rst. For that you need to enable CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the initrd for microcode blobs. @@ -1329,7 +1349,7 @@ config MICROCODE_OLD_INTERFACE It is inadequate because it runs too late to be able to properly load microcode on a machine and it needs special tools. Instead, you should've switched to the early loading method with the initrd or - builtin microcode by now: Documentation/x86/microcode.txt + builtin microcode by now: Documentation/x86/microcode.rst config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" @@ -1478,7 +1498,7 @@ config X86_5LEVEL A kernel with the option enabled can be booted on machines that support 4- or 5-level paging. - See Documentation/x86/x86_64/5level-paging.txt for more + See Documentation/x86/x86_64/5level-paging.rst for more information. Say N if unsure. @@ -1626,7 +1646,7 @@ config ARCH_MEMORY_PROBE depends on X86_64 && MEMORY_HOTPLUG help This option enables a sysfs memory/probe interface for testing. - See Documentation/memory-hotplug.txt for more information. + See Documentation/admin-guide/mm/memory-hotplug.rst for more information. If you are unsure how to answer this question, answer N. config ARCH_PROC_KCORE_TEXT @@ -1783,7 +1803,7 @@ config MTRR You can safely say Y even if your machine doesn't have MTRRs, you'll just add about 9 KB to your kernel. - See <file:Documentation/x86/mtrr.txt> for more information. + See <file:Documentation/x86/mtrr.rst> for more information. config MTRR_SANITIZER def_bool y @@ -1895,7 +1915,7 @@ config X86_INTEL_MPX process and adds some branches to paths used during exec() and munmap(). - For details, see Documentation/x86/intel_mpx.txt + For details, see Documentation/x86/intel_mpx.rst If unsure, say N. @@ -1911,7 +1931,7 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS page-based protections, but without requiring modification of the page tables when an application changes protection domains. - For details, see Documentation/x86/protection-keys.txt + For details, see Documentation/core-api/protection-keys.rst If unsure, say y. @@ -2037,7 +2057,7 @@ config CRASH_DUMP to a memory address not used by the main kernel or BIOS using PHYSICAL_START, or it must be built as a relocatable image (CONFIG_RELOCATABLE=y). - For more details see Documentation/kdump/kdump.txt + For more details see Documentation/kdump/kdump.rst config KEXEC_JUMP bool "kexec jump" @@ -2074,7 +2094,7 @@ config PHYSICAL_START the reserved region. In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed - kernel. Please take a look at Documentation/kdump/kdump.txt + kernel. Please take a look at Documentation/kdump/kdump.rst for more details about crash dumps. 
Usage of bzImage for capturing the crash dump is recommended as @@ -2285,7 +2305,7 @@ config COMPAT_VDSO choice prompt "vsyscall table for legacy applications" depends on X86_64 - default LEGACY_VSYSCALL_EMULATE + default LEGACY_VSYSCALL_XONLY help Legacy user code that does not know how to find the vDSO expects to be able to issue three syscalls by calling fixed addresses in @@ -2293,23 +2313,38 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[emulate|none]. + line parameter vsyscall=[emulate|xonly|none]. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty to improve security. - If unsure, select "Emulate". + If unsure, select "Emulate execution only". config LEGACY_VSYSCALL_EMULATE - bool "Emulate" + bool "Full emulation" + help + The kernel traps and emulates calls into the fixed vsyscall + address mapping. This makes the mapping non-executable, but + it still contains readable known contents, which could be + used in certain rare security vulnerability exploits. This + configuration is recommended when using legacy userspace + that still uses vsyscalls along with legacy binary + instrumentation tools that require code to be readable. + + An example of this type of legacy userspace is running + Pin on an old binary that still uses vsyscalls. + + config LEGACY_VSYSCALL_XONLY + bool "Emulate execution only" help - The kernel traps and emulates calls into the fixed - vsyscall address mapping. This makes the mapping - non-executable, but it still contains known contents, - which could be used in certain rare security vulnerability - exploits. This configuration is recommended when userspace - still uses the vsyscall area. + The kernel traps and emulates calls into the fixed vsyscall + address mapping and does not allow reads. This + configuration is recommended when userspace might use the + legacy vsyscall area but support for legacy binary + instrumentation of legacy code is not needed. It mitigates + certain uses of the vsyscall area as an ASLR-bypassing + buffer. config LEGACY_VSYSCALL_NONE bool "None" @@ -2873,9 +2908,6 @@ config HAVE_ATOMIC_IOMAP config X86_DEV_DMA_OPS bool -config HAVE_GENERIC_GUP - def_bool y - source "drivers/firmware/Kconfig" source "arch/x86/kvm/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6adce15268bd..8e29c991ba3e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -480,3 +480,16 @@ config CPU_SUP_UMC_32 CPU might render the kernel unbootable. If unsure, say N. + +config CPU_SUP_ZHAOXIN + default y + bool "Support Zhaoxin processors" if PROCESSOR_SELECT + help + This enables detection, tunings and quirks for Zhaoxin processors + + You need this enabled if you want your kernel to run on a + Zhaoxin CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin + CPU might render the kernel unbootable. + + If unsure, say N. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index f730680dc818..71c92db47c41 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -156,7 +156,7 @@ config IOMMU_DEBUG code. When you use it make sure you have a big enough IOMMU/AGP aperture. Most of the options enabled by this can be set more finegrained using the iommu= command line - options. See Documentation/x86/x86_64/boot-options.txt for more + options. 
See Documentation/x86/x86_64/boot-options.rst for more details. config IOMMU_LEAK @@ -179,26 +179,6 @@ config X86_DECODER_SELFTEST decoder code. If unsure, say "N". -# -# IO delay types: -# - -config IO_DELAY_TYPE_0X80 - int - default "0" - -config IO_DELAY_TYPE_0XED - int - default "1" - -config IO_DELAY_TYPE_UDELAY - int - default "2" - -config IO_DELAY_TYPE_NONE - int - default "3" - choice prompt "IO delay type" default IO_DELAY_0X80 @@ -229,30 +209,6 @@ config IO_DELAY_NONE endchoice -if IO_DELAY_0X80 -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_0X80 -endif - -if IO_DELAY_0XED -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_0XED -endif - -if IO_DELAY_UDELAY -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_UDELAY -endif - -if IO_DELAY_NONE -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_NONE -endif - config DEBUG_BOOT_PARAMS bool "Debug boot parameters" depends on DEBUG_KERNEL diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index ad84239e595e..15255f388a85 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -44,17 +44,109 @@ static acpi_physical_address get_acpi_rsdp(void) return addr; } -/* Search EFI system tables for RSDP. */ -static acpi_physical_address efi_get_rsdp_addr(void) +/* + * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and + * ACPI_TABLE_GUID are found, take the former, which has more features. + */ +static acpi_physical_address +__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables, + bool efi_64) { acpi_physical_address rsdp_addr = 0; #ifdef CONFIG_EFI - unsigned long systab, systab_tables, config_tables; + int i; + + /* Get EFI tables from systab. */ + for (i = 0; i < nr_tables; i++) { + acpi_physical_address table; + efi_guid_t guid; + + if (efi_64) { + efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i; + + guid = tbl->guid; + table = tbl->table; + + if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { + debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); + return 0; + } + } else { + efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i; + + guid = tbl->guid; + table = tbl->table; + } + + if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) + rsdp_addr = table; + else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) + return table; + } +#endif + return rsdp_addr; +} + +/* EFI/kexec support is 64-bit only. */ +#ifdef CONFIG_X86_64 +static struct efi_setup_data *get_kexec_setup_data_addr(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params->hdr.setup_data; + while (pa_data) { + data = (struct setup_data *)pa_data; + if (data->type == SETUP_EFI) + return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); + + pa_data = data->next; + } + return NULL; +} + +static acpi_physical_address kexec_get_rsdp_addr(void) +{ + efi_system_table_64_t *systab; + struct efi_setup_data *esd; + struct efi_info *ei; + char *sig; + + esd = (struct efi_setup_data *)get_kexec_setup_data_addr(); + if (!esd) + return 0; + + if (!esd->tables) { + debug_putstr("Wrong kexec SETUP_EFI data.\n"); + return 0; + } + + ei = &boot_params->efi_info; + sig = (char *)&ei->efi_loader_signature; + if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + debug_putstr("Wrong kexec EFI loader signature.\n"); + return 0; + } + + /* Get systab from boot params. 
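+ * The 64-bit system table address is split across ei->efi_systab
+ * (low 32 bits) and ei->efi_systab_hi (high 32 bits), so it is
+ * reassembled below before being dereferenced.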
*/ + systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32)); + if (!systab) + error("EFI system table not found in kexec boot_params."); + + return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true); +} +#else +static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } +#endif /* CONFIG_X86_64 */ + +static acpi_physical_address efi_get_rsdp_addr(void) +{ +#ifdef CONFIG_EFI + unsigned long systab, config_tables; unsigned int nr_tables; struct efi_info *ei; bool efi_64; - int size, i; char *sig; ei = &boot_params->efi_info; @@ -88,49 +180,20 @@ static acpi_physical_address efi_get_rsdp_addr(void) config_tables = stbl->tables; nr_tables = stbl->nr_tables; - size = sizeof(efi_config_table_64_t); } else { efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab; config_tables = stbl->tables; nr_tables = stbl->nr_tables; - size = sizeof(efi_config_table_32_t); } if (!config_tables) error("EFI config tables not found."); - /* Get EFI tables from systab. */ - for (i = 0; i < nr_tables; i++) { - acpi_physical_address table; - efi_guid_t guid; - - config_tables += size; - - if (efi_64) { - efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables; - - guid = tbl->guid; - table = tbl->table; - - if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { - debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); - return 0; - } - } else { - efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables; - - guid = tbl->guid; - table = tbl->table; - } - - if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) - rsdp_addr = table; - else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) - return table; - } + return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64); +#else + return 0; #endif - return rsdp_addr; } static u8 compute_checksum(u8 *buffer, u32 length) @@ -220,6 +283,14 @@ acpi_physical_address get_rsdp_addr(void) if (!pa) pa = boot_params->acpi_rsdp_addr; + /* + * Try to get EFI data from setup_data. This can happen when we're a + * kexec'ed kernel and kexec(1) has passed all the required EFI info to + * us. + */ + if (!pa) + pa = kexec_get_rsdp_addr(); + if (!pa) pa = efi_get_rsdp_addr(); diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fafb75c6c592..6233ae35d0d9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -659,6 +659,7 @@ no_longmode: gdt64: .word gdt_end - gdt .quad 0 + .balign 8 gdt: .word gdt_end - gdt .long gdt diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 5a237e8dbf8d..24e65a0f756d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -351,9 +351,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, /* Clear flags intended for solely in-kernel use. */ boot_params->hdr.loadflags &= ~KASLR_FLAG; - /* Save RSDP address for later use. */ - /* boot_params->acpi_rsdp_addr = get_rsdp_addr(); */ - sanitize_boot_params(boot_params); if (boot_params->screen_info.orig_video_mode == 7) { @@ -368,6 +365,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, cols = boot_params->screen_info.orig_video_cols; console_init(); + + /* + * Save RSDP address for later use. Have this after console_init() + * so that early debugging output from the RSDP parsing code can be + * collected. 
+ */ + boot_params->acpi_rsdp_addr = get_rsdp_addr(); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 850b8762e889..2c11c0f45d49 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -313,7 +313,7 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just type_of_loader: .byte 0 # 0 means ancient bootloader, newer # bootloaders know to change this. - # See Documentation/x86/boot.txt for + # See Documentation/x86/boot.rst for # assigned ids # flags, unused bits must be zero (RFU) bit within loadflags @@ -419,7 +419,17 @@ xloadflags: # define XLF4 0 #endif - .word XLF0 | XLF1 | XLF23 | XLF4 +#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_5LEVEL +#define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED) +#else +#define XLF56 XLF_5LEVEL +#endif +#else +#define XLF56 0 +#endif + + .word XLF0 | XLF1 | XLF23 | XLF4 | XLF56 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 2b2481acc661..59ce9ed58430 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -130,7 +130,6 @@ CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index e8829abf063a..d0a5ffeae8df 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -129,7 +129,6 @@ CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index e9b866e87d48..73c0ccb009a0 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -371,20 +371,6 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) } } -static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - - aesni_enc(ctx, dst, src); -} - -static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - - aesni_dec(ctx, dst, src); -} - static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { @@ -920,7 +906,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req) } #endif -static struct crypto_alg aesni_algs[] = { { +static struct crypto_alg aesni_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-aesni", .cra_priority = 300, @@ -937,24 +923,7 @@ static struct crypto_alg aesni_algs[] = { { .cia_decrypt = aes_decrypt } } -}, { - .cra_name = "__aes", - .cra_driver_name = "__aes-aesni", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = CRYPTO_AES_CTX_SIZE, - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = __aes_encrypt, - .cia_decrypt = __aes_decrypt - } - } -} }; +}; static struct skcipher_alg aesni_skciphers[] = { { @@ -1150,7 +1119,7 @@ static int __init aesni_init(void) #endif #endif - err = 
crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + err = crypto_register_alg(&aesni_cipher_alg); if (err) return err; @@ -1158,7 +1127,7 @@ static int __init aesni_init(void) ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); if (err) - goto unregister_algs; + goto unregister_cipher; err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), aesni_simd_aeads); @@ -1170,8 +1139,8 @@ static int __init aesni_init(void) unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); -unregister_algs: - crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); +unregister_cipher: + crypto_unregister_alg(&aesni_cipher_alg); return err; } @@ -1181,7 +1150,7 @@ static void __exit aesni_exit(void) aesni_simd_aeads); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); - crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + crypto_unregister_alg(&aesni_cipher_alg); } late_initcall(aesni_init); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 1ce0019c059c..388f95a4ec24 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -124,7 +124,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } static int chacha_simd_stream_xor(struct skcipher_walk *walk, - struct chacha_ctx *ctx, u8 *iv) + const struct chacha_ctx *ctx, const u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); int next_yield = 4096; /* bytes until next FPU yield */ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index efb0d1b1f15f..9f1f9e3b8230 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -172,21 +172,6 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -/* - * This is a sneaky trick to help the unwinder find pt_regs on the stack. The - * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just setting the LSB, which makes it an invalid stack address and is also - * a signal to the unwinder that it's a pt_regs pointer in disguise. - * - * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts - * the original rbp. 
- */ -.macro ENCODE_FRAME_POINTER ptregs_offset=0 -#ifdef CONFIG_FRAME_POINTER - leaq 1+\ptregs_offset(%rsp), %rbp -#endif -.endm - #ifdef CONFIG_PAGE_TABLE_ISOLATION /* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2418804e66b4..536b574b6161 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -72,23 +72,18 @@ static long syscall_trace_enter(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); unsigned long ret = 0; - bool emulated = false; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + work = READ_ONCE(ti->flags); - if (unlikely(work & _TIF_SYSCALL_EMU)) - emulated = true; - - if ((emulated || (work & _TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - return -1L; - - if (emulated) - return -1L; + if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = tracehook_report_syscall_entry(regs); + if (ret || (work & _TIF_SYSCALL_EMU)) + return -1L; + } #ifdef CONFIG_SECCOMP /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7b23431be5cb..90b473297299 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -67,7 +67,6 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all_kernel #endif .macro TRACE_IRQS_IRET @@ -203,9 +202,102 @@ .Lend_\@: .endm +#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) +#define CS_FROM_KERNEL (1 << 29) + +.macro FIXUP_FRAME + /*
+ * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+ * Clear them in case hardware didn't do this for us.
+ */
+ andl $0x0000ffff, 3*4(%esp)
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, 4*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+#endif
+ testl $SEGMENT_RPL_MASK, 3*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+
+ orl $CS_FROM_KERNEL, 3*4(%esp)
+
+ /*
+ * When we're here from kernel mode, the (exception) stack looks like:
+ *
+ * 5*4(%esp) - <previous context>
+ * 4*4(%esp) - flags
+ * 3*4(%esp) - cs
+ * 2*4(%esp) - ip
+ * 1*4(%esp) - orig_eax
+ * 0*4(%esp) - gs / function
+ *
+ * Let's build a 5-entry IRET frame after that, such that struct pt_regs
+ * is complete and in particular regs->sp is correct. This gives us
+ * the original 5 entries as gap:
+ *
+ * 12*4(%esp) - <previous context>
+ * 11*4(%esp) - gap / flags
+ * 10*4(%esp) - gap / cs
+ * 9*4(%esp) - gap / ip
+ * 8*4(%esp) - gap / orig_eax
+ * 7*4(%esp) - gap / gs / function
+ * 6*4(%esp) - ss
+ * 5*4(%esp) - sp
+ * 4*4(%esp) - flags
+ * 3*4(%esp) - cs
+ * 2*4(%esp) - ip
+ * 1*4(%esp) - orig_eax
+ * 0*4(%esp) - gs / function
+ */
+
+ pushl %ss # ss
+ pushl %esp # sp (points at ss)
+ addl $6*4, (%esp) # point sp back at the previous context
+ pushl 6*4(%esp) # flags
+ pushl 6*4(%esp) # cs
+ pushl 6*4(%esp) # ip
+ pushl 6*4(%esp) # orig_eax
+ pushl 6*4(%esp) # gs / function
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+ testl $CS_FROM_KERNEL, 1*4(%esp)
+ jz .Lfinished_frame_\@
+
+ /*
+ * Reconstruct the 3-entry IRET frame right after the (modified)
+ * regs->sp without lowering %esp in between, such that an NMI in the
+ * middle doesn't scribble our stack. 
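+ * The frame is assembled below the target regs->sp using %eax as the
+ * base register, and %esp is only switched to it at the very end by a
+ * single lea, so the stack stays valid at every instruction.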
+ */ + pushl %eax + pushl %ecx + movl 5*4(%esp), %eax # (modified) regs->sp + + movl 4*4(%esp), %ecx # flags + movl %ecx, -4(%eax) + + movl 3*4(%esp), %ecx # cs + andl $0x0000ffff, %ecx + movl %ecx, -8(%eax) + + movl 2*4(%esp), %ecx # ip + movl %ecx, -12(%eax) + + movl 1*4(%esp), %ecx # eax + movl %ecx, -16(%eax) + + popl %ecx + lea -16(%eax), %esp + popl %eax +.Lfinished_frame_\@: +.endm + .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS + FIXUP_FRAME pushl %fs pushl %es pushl %ds @@ -247,22 +339,6 @@ .Lend_\@: .endm -/* - * This is a sneaky trick to help the unwinder find pt_regs on the stack. The - * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just clearing the MSB, which makes it an invalid stack address and is also - * a signal to the unwinder that it's a pt_regs pointer in disguise. - * - * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the - * original rbp. - */ -.macro ENCODE_FRAME_POINTER -#ifdef CONFIG_FRAME_POINTER - mov %esp, %ebp - andl $0x7fffffff, %ebp -#endif -.endm - .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -375,9 +451,6 @@ * switch to it before we do any copying. */ -#define CS_FROM_ENTRY_STACK (1 << 31) -#define CS_FROM_USER_CR3 (1 << 30) - .macro SWITCH_TO_KERNEL_STACK ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV @@ -391,13 +464,6 @@ * that register for the time this macro runs */ - /* - * The high bits of the CS dword (__csh) are used for - * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case - * hardware didn't do this for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -755,7 +821,7 @@ ret_from_intr: andl $SEGMENT_RPL_MASK, %eax #endif cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + jb restore_all_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) @@ -765,18 +831,6 @@ ENTRY(resume_userspace) jmp restore_all END(ret_from_exception) -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all_kernel - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all_kernel - call preempt_schedule_irq - jmp restore_all_kernel -END(resume_kernel) -#endif - GLOBAL(__begin_SYSENTER_singlestep_region) /* * All code from here through __end_SYSENTER_singlestep_region is subject @@ -1019,6 +1073,7 @@ restore_all: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: + IRET_FRAME /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * when returning from IPI handler and when returning from @@ -1027,6 +1082,15 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: +#ifdef CONFIG_PREEMPT + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz .Lno_preempt + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? 
+ jz .Lno_preempt + call preempt_schedule_irq +.Lno_preempt: +#endif TRACE_IRQS_IRET PARANOID_EXIT_TO_KERNEL_MODE BUG_IF_WRONG_CR3 @@ -1104,6 +1168,30 @@ ENTRY(irq_entries_start) .endr END(irq_entries_start) +#ifdef CONFIG_X86_LOCAL_APIC + .align 8 +ENTRY(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_spurious + .align 8 + .endr +END(spurious_entries_start) + +common_spurious: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + movl %esp, %eax + call smp_spurious_interrupt + jmp ret_from_intr +ENDPROC(common_spurious) +#endif + /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: @@ -1360,6 +1448,7 @@ END(page_fault) common_exception: /* the function address is in %gs's slot on the stack */ + FIXUP_FRAME pushl %fs pushl %es pushl %ds diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 11aa3b2afa4d..0ea4831a72a4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -8,7 +8,7 @@ * * entry.S contains the system-call and fault low-level handling routines. * - * Some of this is documented in Documentation/x86/entry_64.txt + * Some of this is documented in Documentation/x86/entry_64.rst * * A note on terminology: * - iret frame: Architecture defined interrupt frame from SS to RIP @@ -375,6 +375,18 @@ ENTRY(irq_entries_start) .endr END(irq_entries_start) + .align 8 +ENTRY(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + UNWIND_HINT_IRET_REGS + pushq $(~vector+0x80) /* Note: always in signed byte range */ + jmp common_spurious + .align 8 + vector=vector+1 + .endr +END(spurious_entries_start) + .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY pushq %rax @@ -571,10 +583,20 @@ _ASM_NOKPROBE(interrupt_entry) /* Interrupt entry/exit. */ - /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_interrupt. - */ +/* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_spurious/interrupt. + */ +common_spurious: + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call smp_spurious_interrupt /* rdi points to pt_regs */ + jmp ret_from_intr +END(common_spurious) +_ASM_NOKPROBE(common_spurious) + +/* common_interrupt is a hotpath. Align it */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ @@ -1142,6 +1164,11 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ +#if IS_ENABLED(CONFIG_ACRN_GUEST) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + acrn_hv_callback_vector acrn_hv_vector_handler +#endif + idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry stack_segment do_stack_segment has_error_code=1 @@ -1670,11 +1697,17 @@ nmi_restore: iretq END(nmi) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. 
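+ * Instead, the kernel points MSR_CSTAR at this stub, so a stray
+ * 32-bit SYSCALL fails cleanly with -ENOSYS.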
+ */ ENTRY(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) +#endif ENTRY(rewind_stack_do_exit) UNWIND_HINT_FUNC diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ad968b7bac72..c00019abd076 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -438,3 +438,5 @@ 431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig 432 i386 fsmount sys_fsmount __ia32_sys_fsmount 433 i386 fspick sys_fspick __ia32_sys_fspick +434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open +435 i386 clone3 sys_clone3 __ia32_sys_clone3 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index b4e6f9e6204a..c29976eca4a8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -355,6 +355,8 @@ 431 common fsconfig __x64_sys_fsconfig 432 common fsmount __x64_sys_fsmount 433 common fspick __x64_sys_fspick +434 common pidfd_open __x64_sys_pidfd_open +435 common clone3 __x64_sys_clone3/ptregs # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 42fe42e82baf..39106111be86 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,6 +3,12 @@ # Building vDSO images for x86. # +# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before +# the inclusion of the generic Makefile. +ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE| +ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE +include $(srctree)/lib/vdso/Makefile + KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n UBSAN_SANITIZE := n @@ -51,6 +57,7 @@ VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) + $(call if_changed,vdso_check) HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi hostprogs-y += vdso2c @@ -121,6 +128,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) + $(call if_changed,vdso_check) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 @@ -160,6 +168,7 @@ $(obj)/vdso32.so.dbg: FORCE \ $(obj)/vdso32/system_call.o \ $(obj)/vdso32/sigreturn.o $(call if_changed,vdso) + $(call if_changed,vdso_check) # # The DSO images are built using a special linker script. diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 4aed41f638bb..d9ff616bb0f6 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -1,251 +1,85 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright 2006 Andi Kleen, SUSE Labs. - * * Fast user context implementation of clock_gettime, gettimeofday, and time. * + * Copyright 2006 Andi Kleen, SUSE Labs. + * Copyright 2019 ARM Limited + * * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net> * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * The code should have no internal unresolved relocations. - * Check with readelf after changing. 
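 * (This manual check is now automated: the vdso Makefile above pulls in
 * lib/vdso/Makefile and runs its vdso_check rule, which scans the image
 * for the absolute relocation types listed in ARCH_REL_TYPE_ABS.)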
*/ - -#include <uapi/linux/time.h> -#include <asm/vgtod.h> -#include <asm/vvar.h> -#include <asm/unistd.h> -#include <asm/msr.h> -#include <asm/pvclock.h> -#include <asm/mshyperv.h> -#include <linux/math64.h> #include <linux/time.h> #include <linux/kernel.h> +#include <linux/types.h> -#define gtod (&VVAR(vsyscall_gtod_data)) +#include "../../../../lib/vdso/gettimeofday.c" -extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); -extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); -#ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page[PAGE_SIZE] - __attribute__((visibility("hidden"))); -#endif - -#ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page[PAGE_SIZE] - __attribute__((visibility("hidden"))); -#endif - -#ifndef BUILD_VDSO32 - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { - long ret; - asm ("syscall" : "=a" (ret), "=m" (*ts) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : - "rcx", "r11"); - return ret; + return __cvdso_gettimeofday(tv, tz); } -#else +int gettimeofday(struct __kernel_old_timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +time_t __vdso_time(time_t *t) { - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[clock], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*ts) - : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) - : "edx"); - return ret; + return __cvdso_time(t); } -#endif +time_t time(time_t *t) __attribute__((weak, alias("__vdso_time"))); -#ifdef CONFIG_PARAVIRT_CLOCK -static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) -{ - return (const struct pvclock_vsyscall_time_info *)&pvclock_page; -} -static notrace u64 vread_pvclock(void) -{ - const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - u32 version; - u64 ret; - - /* - * Note: The kernel and hypervisor must guarantee that cpu ID - * number maps 1:1 to per-CPU pvclock time info. - * - * Because the hypervisor is entirely unaware of guest userspace - * preemption, it cannot guarantee that per-CPU pvclock time - * info is updated if the underlying CPU changes or that that - * version is increased whenever underlying CPU changes. - * - * On KVM, we are guaranteed that pvti updates for any vCPU are - * atomic as seen by *all* vCPUs. This is an even stronger - * guarantee than we get with a normal seqlock. - * - * On Xen, we don't appear to have that guarantee, but Xen still - * supplies a valid seqlock using the version field. - * - * We only do pvclock vdso timing at all if - * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to - * mean that all vCPUs have matching pvti and that the TSC is - * synced, so we can just look at vCPU 0's pvti. 
- */ - - do { - version = pvclock_read_begin(pvti); - - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) - return U64_MAX; - - ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); - } while (pvclock_read_retry(pvti, version)); - - return ret; -} -#endif -#ifdef CONFIG_HYPERV_TSCPAGE -static notrace u64 vread_hvclock(void) -{ - const struct ms_hyperv_tsc_page *tsc_pg = - (const struct ms_hyperv_tsc_page *)&hvclock_page; +#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64) +/* both 64-bit and x32 use these */ +extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res); - return hv_read_tsc_page(tsc_pg); -} -#endif - -notrace static inline u64 vgetcyc(int mode) +int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { - if (mode == VCLOCK_TSC) - return (u64)rdtsc_ordered(); - - /* - * For any memory-mapped vclock type, we need to make sure that gcc - * doesn't cleverly hoist a load before the mode check. Otherwise we - * might end up touching the memory-mapped page even if the vclock in - * question isn't enabled, which will segfault. Hence the barriers. - */ -#ifdef CONFIG_PARAVIRT_CLOCK - if (mode == VCLOCK_PVCLOCK) { - barrier(); - return vread_pvclock(); - } -#endif -#ifdef CONFIG_HYPERV_TSCPAGE - if (mode == VCLOCK_HVCLOCK) { - barrier(); - return vread_hvclock(); - } -#endif - return U64_MAX; + return __cvdso_clock_gettime(clock, ts); } -notrace static int do_hres(clockid_t clk, struct timespec *ts) -{ - struct vgtod_ts *base = >od->basetime[clk]; - u64 cycles, last, sec, ns; - unsigned int seq; - - do { - seq = gtod_read_begin(gtod); - cycles = vgetcyc(gtod->vclock_mode); - ns = base->nsec; - last = gtod->cycle_last; - if (unlikely((s64)cycles < 0)) - return vdso_fallback_gettime(clk, ts); - if (cycles > last) - ns += (cycles - last) * gtod->mult; - ns >>= gtod->shift; - sec = base->sec; - } while (unlikely(gtod_read_retry(gtod, seq))); - - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. - */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return 0; -} +int clock_gettime(clockid_t, struct __kernel_timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); -notrace static void do_coarse(clockid_t clk, struct timespec *ts) +int __vdso_clock_getres(clockid_t clock, + struct __kernel_timespec *res) { - struct vgtod_ts *base = >od->basetime[clk]; - unsigned int seq; - - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = base->sec; - ts->tv_nsec = base->nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); + return __cvdso_clock_getres(clock, res); } +int clock_getres(clockid_t, struct __kernel_timespec *) + __attribute__((weak, alias("__vdso_clock_getres"))); -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +#else +/* i386 only */ +extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts); +extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res); + +int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts) { - unsigned int msk; - - /* Sort out negative (CPU/FD) and invalid clocks */ - if (unlikely((unsigned int) clock >= MAX_CLOCKS)) - return vdso_fallback_gettime(clock, ts); - - /* - * Convert the clockid to a bitmask and use it to check which - * clocks are handled in the VDSO directly. 
- */ - msk = 1U << clock; - if (likely(msk & VGTOD_HRES)) { - return do_hres(clock, ts); - } else if (msk & VGTOD_COARSE) { - do_coarse(clock, ts); - return 0; - } - return vdso_fallback_gettime(clock, ts); + return __cvdso_clock_gettime32(clock, ts); } -int clock_gettime(clockid_t, struct timespec *) +int clock_gettime(clockid_t, struct old_timespec32 *) __attribute__((weak, alias("__vdso_clock_gettime"))); -notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts) { - if (likely(tv != NULL)) { - struct timespec *ts = (struct timespec *) tv; - - do_hres(CLOCK_REALTIME, ts); - tv->tv_usec /= 1000; - } - if (unlikely(tz != NULL)) { - tz->tz_minuteswest = gtod->tz_minuteswest; - tz->tz_dsttime = gtod->tz_dsttime; - } - - return 0; + return __cvdso_clock_gettime(clock, ts); } -int gettimeofday(struct timeval *, struct timezone *) - __attribute__((weak, alias("__vdso_gettimeofday"))); -/* - * This will break when the xtime seconds get inaccurate, but that is - * unlikely - */ -notrace time_t __vdso_time(time_t *t) -{ - /* This is atomic on x86 so we don't need any locks. */ - time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec); +int clock_gettime64(clockid_t, struct __kernel_timespec *) + __attribute__((weak, alias("__vdso_clock_gettime64"))); - if (t) - *t = result; - return result; +int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res) +{ + return __cvdso_clock_getres_time32(clock, res); } -time_t time(time_t *t) - __attribute__((weak, alias("__vdso_time"))); + +int clock_getres(clockid_t, struct old_timespec32 *) + __attribute__((weak, alias("__vdso_clock_getres"))); +#endif diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index d3a2dce4cfa9..36b644e16272 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -25,6 +25,8 @@ VERSION { __vdso_getcpu; time; __vdso_time; + clock_getres; + __vdso_clock_getres; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S index 422764a81d32..c7720995ab1a 100644 --- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -26,6 +26,8 @@ VERSION __vdso_clock_gettime; __vdso_gettimeofday; __vdso_time; + __vdso_clock_getres; + __vdso_clock_gettime64; }; LINUX_2.5 { diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S index 05cd1c5c4a15..16a8050a4fb6 100644 --- a/arch/x86/entry/vdso/vdsox32.lds.S +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -21,6 +21,7 @@ VERSION { __vdso_gettimeofday; __vdso_getcpu; __vdso_time; + __vdso_clock_getres; local: *; }; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 8db1f594e8b1..349a61d8bf34 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -22,7 +22,7 @@ #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> -#include <asm/mshyperv.h> +#include <clocksource/hyperv_timer.h> #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile index 1ac4dd116c26..93c1b3e949a7 100644 --- a/arch/x86/entry/vsyscall/Makefile +++ b/arch/x86/entry/vsyscall/Makefile @@ -2,7 +2,5 @@ # # Makefile for the x86 low level vsyscall code # -obj-y := vsyscall_gtod.o - obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c 
b/arch/x86/entry/vsyscall/vsyscall_64.c index d9d81ad7a400..e7c596dea947 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -42,9 +42,11 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NONE } vsyscall_mode = +static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; +#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) + XONLY; #else EMULATE; #endif @@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str) if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; + else if (!strcmp("xonly", str)) + vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else @@ -106,14 +110,15 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr, current); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; @@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) long ret; unsigned long orig_dx; + /* Write faults or kernel-privilege faults never get fixed up. */ + if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) + return false; + + if (!(error_code & X86_PF_INSTR)) { + /* Failed vsyscall read */ + if (vsyscall_mode == EMULATE) + return false; + + /* + * User code tried and failed to read the vsyscall page. + */ + warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); + return false; + } + /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. @@ -268,7 +289,7 @@ do_ret: return true; sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return true; } @@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; -static struct vm_area_struct gate_vma = { +static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, @@ -357,12 +378,20 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - if (vsyscall_mode != NONE) { + /* + * For full emulation, the page needs to exist for real. In + * execute-only mode, there is no PTE at all backing the vsyscall + * page. + */ + if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } + if (vsyscall_mode == XONLY) + gate_vma.vm_flags = VM_EXEC; + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c deleted file mode 100644 index cfcdba082feb..000000000000 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. 
- * - * Modified for x86 32 bit architecture by - * Stefani Seibold <stefani@seibold.net> - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. - * - */ - -#include <linux/timekeeper_internal.h> -#include <asm/vgtod.h> -#include <asm/vvar.h> - -int vclocks_used __read_mostly; - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - struct vgtod_ts *base; - u64 nsec; - - /* Mark the new vclock used. */ - BUILD_BUG_ON(VCLOCK_MAX >= 32); - WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - base = &vdata->basetime[CLOCK_REALTIME]; - base->sec = tk->xtime_sec; - base->nsec = tk->tkr_mono.xtime_nsec; - - base = &vdata->basetime[CLOCK_TAI]; - base->sec = tk->xtime_sec + (s64)tk->tai_offset; - base->nsec = tk->tkr_mono.xtime_nsec; - - base = &vdata->basetime[CLOCK_MONOTONIC]; - base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); - while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - base->sec++; - } - base->nsec = nsec; - - base = &vdata->basetime[CLOCK_REALTIME_COARSE]; - base->sec = tk->xtime_sec; - base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - base = &vdata->basetime[CLOCK_MONOTONIC_COARSE]; - base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - nsec += tk->wall_to_monotonic.tv_nsec; - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - base->sec++; - } - base->nsec = nsec; - - gtod_write_end(vdata); -} diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9cbfd34042d5..9e07f554333f 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o +obj-y += core.o probe.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3cd94a21bd53..81b005e4c7d9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1618,68 +1618,6 @@ static struct attribute_group x86_pmu_format_group __ro_after_init = { .attrs = NULL, }; -/* - * Remove all undefined events (x86_pmu.event_map(id) == 0) - * out of events_attr attributes. 
- */ -static void __init filter_events(struct attribute **attrs) -{ - struct device_attribute *d; - struct perf_pmu_events_attr *pmu_attr; - int offset = 0; - int i, j; - - for (i = 0; attrs[i]; i++) { - d = (struct device_attribute *)attrs[i]; - pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); - /* str trumps id */ - if (pmu_attr->event_str) - continue; - if (x86_pmu.event_map(i + offset)) - continue; - - for (j = i; attrs[j]; j++) - attrs[j] = attrs[j + 1]; - - /* Check the shifted attr. */ - i--; - - /* - * event_map() is index based, the attrs array is organized - * by increasing event index. If we shift the events, then - * we need to compensate for the event_map(), otherwise - * we are looking up the wrong event in the map - */ - offset++; - } -} - -/* Merge two pointer arrays */ -__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) -{ - struct attribute **new; - int j, i; - - for (j = 0; a && a[j]; j++) - ; - for (i = 0; b && b[i]; i++) - j++; - j++; - - new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL); - if (!new) - return NULL; - - j = 0; - for (i = 0; a && a[i]; i++) - new[j++] = a[i]; - for (i = 0; b && b[i]; i++) - new[j++] = b[i]; - new[j] = NULL; - - return new; -} - ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ @@ -1744,9 +1682,24 @@ static struct attribute *events_attr[] = { NULL, }; +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static umode_t +is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); + /* str trumps id */ + return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; +} + static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, + .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) @@ -1842,37 +1795,10 @@ static int __init init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; - if (x86_pmu.caps_attrs) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); - if (!WARN_ON(!tmp)) - x86_pmu_caps_group.attrs = tmp; - } - - if (x86_pmu.event_attrs) - x86_pmu_events_group.attrs = x86_pmu.event_attrs; - if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; - else - filter_events(x86_pmu_events_group.attrs); - if (x86_pmu.cpu_events) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); - if (!WARN_ON(!tmp)) - x86_pmu_events_group.attrs = tmp; - } - - if (x86_pmu.attrs) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs); - if (!WARN_ON(!tmp)) - x86_pmu_attr_group.attrs = tmp; - } + pmu.attr_update = x86_pmu.attr_update; pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); @@ -2179,7 +2105,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. 
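 * (With lockdep enabled, the assertion below verifies exactly that.)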
*/ - lockdep_assert_held_exclusive(&mm->mmap_sem); + lockdep_assert_held_write(&mm->mmap_sem); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a5436cee20b1..bda450ff51ee 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -20,6 +20,7 @@ #include <asm/intel-family.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> +#include <asm/hypervisor.h> #include "../perf_event.h" @@ -3897,8 +3898,6 @@ static __initconst const struct x86_pmu core_pmu = { .check_period = intel_pmu_check_period, }; -static struct attribute *intel_pmu_attrs[]; - static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -3930,8 +3929,6 @@ static __initconst const struct x86_pmu intel_pmu = { .format_attrs = intel_arch3_formats_attr, .events_sysfs_show = intel_event_sysfs_show, - .attrs = intel_pmu_attrs, - .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, @@ -4055,6 +4052,13 @@ static bool check_msr(unsigned long msr, u64 mask) u64 val_old, val_new, val_tmp; /* + * Disable the check for real HW, so we don't + * mess with potentially enabled registers: + */ + if (hypervisor_is_type(X86_HYPER_NATIVE)) + return true; + + /* * Read the current value, change it and read it back to see if it * matches; this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. @@ -4274,13 +4278,6 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; -static __init struct attribute **get_icl_events_attrs(void) -{ - return boot_cpu_has(X86_FEATURE_RTM) ? - merge_attr(icl_events_attrs, icl_tsx_events_attrs) : - icl_events_attrs; -} - static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4402,43 +4399,111 @@ static DEVICE_ATTR(allow_tsx_force_abort, 0644, static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, - NULL, /* &dev_attr_allow_tsx_force_abort.attr.attr */ + &dev_attr_allow_tsx_force_abort.attr, NULL, }; -static __init struct attribute ** -get_events_attrs(struct attribute **base, - struct attribute **mem, - struct attribute **tsx) +static umode_t +tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - struct attribute **attrs = base; - struct attribute **old; + return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0; +} - if (mem && x86_pmu.pebs) - attrs = merge_attr(attrs, mem); +static umode_t +pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.pebs ? attr->mode : 0; +} - if (tsx && boot_cpu_has(X86_FEATURE_RTM)) { - old = attrs; - attrs = merge_attr(attrs, tsx); - if (old != base) - kfree(old); - } +static umode_t +lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} - return attrs; +static umode_t +exra_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.version >= 2 ? attr->mode : 0; } +static umode_t +default_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + if (attr == &dev_attr_allow_tsx_force_abort.attr) + return x86_pmu.flags & PMU_FL_TFA ? 
attr->mode : 0; + + return attr->mode; +} + +static struct attribute_group group_events_td = { + .name = "events", +}; + +static struct attribute_group group_events_mem = { + .name = "events", + .is_visible = pebs_is_visible, +}; + +static struct attribute_group group_events_tsx = { + .name = "events", + .is_visible = tsx_is_visible, +}; + +static struct attribute_group group_caps_gen = { + .name = "caps", + .attrs = intel_pmu_caps_attrs, +}; + +static struct attribute_group group_caps_lbr = { + .name = "caps", + .attrs = lbr_attrs, + .is_visible = lbr_is_visible, +}; + +static struct attribute_group group_format_extra = { + .name = "format", + .is_visible = exra_is_visible, +}; + +static struct attribute_group group_format_extra_skl = { + .name = "format", + .is_visible = exra_is_visible, +}; + +static struct attribute_group group_default = { + .attrs = intel_pmu_attrs, + .is_visible = default_is_visible, +}; + +static const struct attribute_group *attr_update[] = { + &group_events_td, + &group_events_mem, + &group_events_tsx, + &group_caps_gen, + &group_caps_lbr, + &group_format_extra, + &group_format_extra_skl, + &group_default, + NULL, +}; + +static struct attribute *empty_attrs; + __init int intel_pmu_init(void) { - struct attribute **extra_attr = NULL; - struct attribute **mem_attr = NULL; - struct attribute **tsx_attr = NULL; - struct attribute **to_free = NULL; + struct attribute **extra_skl_attr = &empty_attrs; + struct attribute **extra_attr = &empty_attrs; + struct attribute **td_attr = &empty_attrs; + struct attribute **mem_attr = &empty_attrs; + struct attribute **tsx_attr = &empty_attrs; union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; unsigned int unused; struct extra_reg *er; + bool pmem = false; int version, i; char *name; @@ -4596,7 +4661,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.cpu_events = slm_events_attrs; + td_attr = slm_events_attrs; extra_attr = slm_format_attr; pr_cont("Silvermont events, "); name = "silvermont"; @@ -4624,7 +4689,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.cpu_events = glm_events_attrs; + td_attr = glm_events_attrs; extra_attr = slm_format_attr; pr_cont("Goldmont events, "); name = "goldmont"; @@ -4651,7 +4716,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; - x86_pmu.cpu_events = glm_events_attrs; + td_attr = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ event_attr_td_total_slots_scale_glm.event_str = "4"; extra_attr = slm_format_attr; @@ -4740,7 +4805,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.cpu_events = snb_events_attrs; + td_attr = snb_events_attrs; mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ @@ -4781,7 +4846,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.cpu_events = snb_events_attrs; + td_attr = snb_events_attrs; mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ @@ -4818,10 +4883,10 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = 
hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + td_attr = hsw_events_attrs; mem_attr = hsw_mem_events_attrs; tsx_attr = hsw_tsx_events_attrs; pr_cont("Haswell events, "); @@ -4860,10 +4925,10 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + td_attr = hsw_events_attrs; mem_attr = hsw_mem_events_attrs; tsx_attr = hsw_tsx_events_attrs; pr_cont("Broadwell events, "); @@ -4890,9 +4955,10 @@ __init int intel_pmu_init(void) name = "knights-landing"; break; + case INTEL_FAM6_SKYLAKE_X: + pmem = true; case INTEL_FAM6_SKYLAKE_MOBILE: case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_MOBILE: case INTEL_FAM6_KABYLAKE_DESKTOP: x86_add_quirk(intel_pebs_isolation_quirk); @@ -4920,27 +4986,28 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; - extra_attr = merge_attr(extra_attr, skl_format_attr); - to_free = extra_attr; - x86_pmu.cpu_events = hsw_events_attrs; + extra_skl_attr = skl_format_attr; + td_attr = hsw_events_attrs; mem_attr = hsw_mem_events_attrs; tsx_attr = hsw_tsx_events_attrs; - intel_pmu_pebs_data_source_skl( - boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); + intel_pmu_pebs_data_source_skl(pmem); if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { x86_pmu.flags |= PMU_FL_TFA; x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; x86_pmu.commit_scheduling = intel_tfa_commit_scheduling; - intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr; } pr_cont("Skylake events, "); name = "skylake"; break; + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_XEON_D: + pmem = true; case INTEL_FAM6_ICELAKE_MOBILE: + case INTEL_FAM6_ICELAKE_DESKTOP: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -4959,11 +5026,12 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = icl_get_event_constraints; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
hsw_format_attr : nhm_format_attr; - extra_attr = merge_attr(extra_attr, skl_format_attr); - x86_pmu.cpu_events = get_icl_events_attrs(); + extra_skl_attr = skl_format_attr; + mem_attr = icl_events_attrs; + tsx_attr = icl_tsx_events_attrs; x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); x86_pmu.lbr_pt_coexist = true; - intel_pmu_pebs_data_source_skl(false); + intel_pmu_pebs_data_source_skl(pmem); pr_cont("Icelake events, "); name = "icelake"; break; @@ -4988,14 +5056,14 @@ __init int intel_pmu_init(void) snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name); - if (version >= 2 && extra_attr) { - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - extra_attr); - WARN_ON(!x86_pmu.format_attrs); - } - x86_pmu.cpu_events = get_events_attrs(x86_pmu.cpu_events, - mem_attr, tsx_attr); + group_events_td.attrs = td_attr; + group_events_mem.attrs = mem_attr; + group_events_tsx.attrs = tsx_attr; + group_format_extra.attrs = extra_attr; + group_format_extra_skl.attrs = extra_skl_attr; + + x86_pmu.attr_update = attr_update; if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -5043,12 +5111,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - x86_pmu.caps_attrs = intel_pmu_caps_attrs; - - if (x86_pmu.lbr_nr) { - x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); + if (x86_pmu.lbr_nr) pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); - } /* * Access extra MSR may cause #GP under certain circumstances. @@ -5078,7 +5142,6 @@ __init int intel_pmu_init(void) if (x86_pmu.counter_freezing) x86_pmu.handle_irq = intel_pmu_handle_irq_v4; - kfree(to_free); return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 6072f92cb8ea..688592b34564 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -96,6 +96,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "../perf_event.h" +#include "../probe.h" MODULE_LICENSE("GPL"); @@ -144,25 +145,42 @@ enum perf_cstate_core_events { PERF_CSTATE_CORE_EVENT_MAX, }; -PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c1-residency, attr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, attr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, attr_cstate_core_c7, "event=0x03"); -static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 }, +static unsigned long core_msr_mask; + +PMU_EVENT_GROUP(events, cstate_core_c1); +PMU_EVENT_GROUP(events, cstate_core_c3); +PMU_EVENT_GROUP(events, cstate_core_c6); +PMU_EVENT_GROUP(events, cstate_core_c7); + +static bool test_msr(int idx, void *data) +{ + return test_bit(idx, (unsigned long *) data); +} + +static struct perf_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &group_cstate_core_c1, test_msr }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, 
&group_cstate_core_c3, test_msr }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &group_cstate_core_c6, test_msr }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &group_cstate_core_c7, test_msr }, }; -static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { +static struct attribute *attrs_empty[] = { NULL, }; +/* + * There are no default events, but we need to create + * "events" group (with empty attrs) before updating + * it with detected events. + */ static struct attribute_group core_events_attr_group = { .name = "events", - .attrs = core_events_attrs, + .attrs = attrs_empty, }; DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); @@ -211,31 +229,37 @@ enum perf_cstate_pkg_events { PERF_CSTATE_PKG_EVENT_MAX, }; -PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); -PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); -PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); -PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); - -static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 }, -}; - -static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { - NULL, +PMU_EVENT_ATTR_STRING(c2-residency, attr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, attr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, attr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, attr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, attr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, attr_cstate_pkg_c10, "event=0x06"); + +static unsigned long pkg_msr_mask; + +PMU_EVENT_GROUP(events, cstate_pkg_c2); +PMU_EVENT_GROUP(events, cstate_pkg_c3); +PMU_EVENT_GROUP(events, cstate_pkg_c6); +PMU_EVENT_GROUP(events, cstate_pkg_c7); +PMU_EVENT_GROUP(events, cstate_pkg_c8); +PMU_EVENT_GROUP(events, cstate_pkg_c9); +PMU_EVENT_GROUP(events, cstate_pkg_c10); + +static struct perf_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &group_cstate_pkg_c2, test_msr }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &group_cstate_pkg_c3, test_msr }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &group_cstate_pkg_c6, test_msr }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &group_cstate_pkg_c7, test_msr }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &group_cstate_pkg_c8, test_msr }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &group_cstate_pkg_c9, test_msr }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, }; static struct attribute_group pkg_events_attr_group = { .name = "events", - .attrs = 
pkg_events_attrs, + .attrs = attrs_empty, }; DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); @@ -289,7 +313,8 @@ static int cstate_pmu_event_init(struct perf_event *event) if (event->pmu == &cstate_core_pmu) { if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) return -EINVAL; - if (!core_msr[cfg].attr) + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_CORE_EVENT_MAX); + if (!(core_msr_mask & (1 << cfg))) return -EINVAL; event->hw.event_base = core_msr[cfg].msr; cpu = cpumask_any_and(&cstate_core_cpu_mask, @@ -298,11 +323,11 @@ static int cstate_pmu_event_init(struct perf_event *event) if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); - if (!pkg_msr[cfg].attr) + if (!(pkg_msr_mask & (1 << cfg))) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; cpu = cpumask_any_and(&cstate_pkg_cpu_mask, - topology_core_cpumask(event->cpu)); + topology_die_cpumask(event->cpu)); } else { return -ENOENT; } @@ -385,7 +410,7 @@ static int cstate_cpu_exit(unsigned int cpu) if (has_cstate_pkg && cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); /* Migrate events if there is a valid target */ if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_pkg_cpu_mask); @@ -414,15 +439,35 @@ static int cstate_cpu_init(unsigned int cpu) * in the package cpu mask as the designated reader. */ target = cpumask_any_and(&cstate_pkg_cpu_mask, - topology_core_cpumask(cpu)); + topology_die_cpumask(cpu)); if (has_cstate_pkg && target >= nr_cpu_ids) cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); return 0; } +const struct attribute_group *core_attr_update[] = { + &group_cstate_core_c1, + &group_cstate_core_c3, + &group_cstate_core_c6, + &group_cstate_core_c7, + NULL, +}; + +const struct attribute_group *pkg_attr_update[] = { + &group_cstate_pkg_c2, + &group_cstate_pkg_c3, + &group_cstate_pkg_c6, + &group_cstate_pkg_c7, + &group_cstate_pkg_c8, + &group_cstate_pkg_c9, + &group_cstate_pkg_c10, + NULL, +}; + static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, + .attr_update = core_attr_update, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, @@ -437,6 +482,7 @@ static struct pmu cstate_core_pmu = { static struct pmu cstate_pkg_pmu = { .attr_groups = pkg_attr_groups, + .attr_update = pkg_attr_update, .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, @@ -580,35 +626,11 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_DESKTOP, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there are no available events. 
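
The cstate_probe_msr() helper being deleted around this point is subsumed by the shared perf_msr_probe() from the new arch/x86/events/probe.c (hence the "../probe.h" include above). Judging from the struct perf_msr initializers above ({ msr, group, test }) and the cstate_probe() hunk below, the helper filters each entry through its test() callback, verifies the MSR is actually readable, and returns a bitmask of usable counters, un-hiding the matching sysfs group as it goes. A minimal sketch of that idea, under those assumptions (probe_msrs is an illustrative name, and the zero flag is read here as "accept counters that currently read zero"):

	/* Sketch: walk a perf_msr table, return a bitmask of usable entries. */
	static unsigned long probe_msrs(struct perf_msr *msr, int cnt,
					bool zero, void *data)
	{
		unsigned long avail = 0;
		unsigned int bit;
		u64 val;

		for (bit = 0; bit < cnt; bit++) {
			if (msr[bit].test && !msr[bit].test(bit, data))
				continue;	/* model does not advertise it */
			if (rdmsrl_safe(msr[bit].msr, &val))
				continue;	/* MSR faults, e.g. under a hypervisor */
			if (!zero && !val)
				continue;	/* optionally reject counters reading 0 */
			avail |= BIT(bit);	/* real helper also unhides msr[bit].grp */
		}
		return avail;
	}

cstate_probe() below passes zero=true, so residency MSRs that happen to read 0 at probe time are still accepted; the RAPL conversion later in this patch passes false.
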
- */ -static bool __init cstate_probe_msr(const unsigned long evmsk, int max, - struct perf_cstate_msr *msr, - struct attribute **attrs) -{ - bool found = false; - unsigned int bit; - u64 val; - - for (bit = 0; bit < max; bit++) { - if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) { - *attrs++ = &msr[bit].attr->attr.attr; - found = true; - } else { - msr[bit].attr = NULL; - } - } - *attrs = NULL; - - return found; -} - static int __init cstate_probe(const struct cstate_model *cm) { /* SLM has different MSR for PKG C6 */ @@ -620,13 +642,14 @@ static int __init cstate_probe(const struct cstate_model *cm) pkg_msr[PERF_CSTATE_CORE_C6_RES].msr = MSR_KNL_CORE_C6_RESIDENCY; - has_cstate_core = cstate_probe_msr(cm->core_events, - PERF_CSTATE_CORE_EVENT_MAX, - core_msr, core_events_attrs); + core_msr_mask = perf_msr_probe(core_msr, PERF_CSTATE_CORE_EVENT_MAX, + true, (void *) &cm->core_events); - has_cstate_pkg = cstate_probe_msr(cm->pkg_events, - PERF_CSTATE_PKG_EVENT_MAX, - pkg_msr, pkg_events_attrs); + pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX, + true, (void *) &cm->pkg_events); + + has_cstate_core = !!core_msr_mask; + has_cstate_pkg = !!pkg_msr_mask; return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; } @@ -663,7 +686,13 @@ static int __init cstate_init(void) } if (has_cstate_pkg) { - err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (topology_max_die_per_package() > 1) { + err = perf_pmu_register(&cstate_pkg_pmu, + "cstate_die", -1); + } else { + err = perf_pmu_register(&cstate_pkg_pmu, + cstate_pkg_pmu.name, -1); + } if (err) { has_cstate_pkg = false; pr_info("Failed to register cstate pkg pmu\n"); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 505c73dc6a73..2c8db2c19328 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -337,7 +337,7 @@ static int alloc_pebs_buffer(int cpu) struct debug_store *ds = hwev->ds; size_t bsiz = x86_pmu.pebs_buffer_size; int max, node = cpu_to_node(cpu); - void *buffer, *ibuffer, *cea; + void *buffer, *insn_buff, *cea; if (!x86_pmu.pebs) return 0; @@ -351,12 +351,12 @@ static int alloc_pebs_buffer(int cpu) * buffer then. 
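
In the PEBS hunk that follows, the local variable is renamed from ibuffer to insn_buff, plausibly just to keep it visually distinct from the per-CPU slot insn_buffer it ends up stored in. The allocation itself is the usual NUMA-aware per-CPU pattern; a condensed sketch using the names from this file (alloc_insn_buff is illustrative):

	static DEFINE_PER_CPU(void *, insn_buffer);	/* PEBS IP-fixup scratch */

	static int alloc_insn_buff(int cpu)
	{
		int node = cpu_to_node(cpu);	/* allocate near the CPU's memory */
		void *insn_buff = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);

		if (!insn_buff)
			return -ENOMEM;
		per_cpu(insn_buffer, cpu) = insn_buff;
		return 0;
	}
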
*/ if (x86_pmu.intel_cap.pebs_format < 2) { - ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); - if (!ibuffer) { + insn_buff = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!insn_buff) { dsfree_pages(buffer, bsiz); return -ENOMEM; } - per_cpu(insn_buffer, cpu) = ibuffer; + per_cpu(insn_buffer, cpu) = insn_buff; } hwev->ds_pebs_vaddr = buffer; /* Update the cpu entry area mapping */ diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 26c03f5adfb9..64ab51ffdf06 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -55,27 +55,28 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/perf_event.h> +#include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "../perf_event.h" +#include "../probe.h" MODULE_LICENSE("GPL"); /* * RAPL energy status counters */ -#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ -#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ -#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ -#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ -#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ -#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ -#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ -#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */ -#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */ - -#define NR_RAPL_DOMAINS 0x5 +enum perf_rapl_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + + PERF_RAPL_MAX, + NR_RAPL_DOMAINS = PERF_RAPL_MAX, +}; + static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", @@ -84,33 +85,6 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "psys", }; -/* Clients have PP0, PKG */ -#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT) - -/* Servers have PP0, PKG, RAM */ -#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT) - -/* Servers have PP0, PKG, RAM, PP1 */ -#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT) - -/* SKL clients have PP0, PKG, RAM, PP1, PSYS */ -#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT|\ - 1<<RAPL_IDX_PSYS_NRG_STAT) - -/* Knights Landing has PKG, RAM */ -#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT) - /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -149,26 +123,32 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; - unsigned int maxpkg; + unsigned int maxdie; struct rapl_pmu *pmus[]; }; +struct rapl_model { + unsigned long events; + bool apply_quirk; +}; + /* 1/2^hw_unit Joule */ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; +static struct perf_msr rapl_msrs[]; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - unsigned int pkgid = topology_logical_package_id(cpu); + unsigned int dieid = topology_logical_die_id(cpu); /* * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return pkgid < rapl_pmus->maxpkg ? 
rapl_pmus->pmus[pkgid] : NULL; + return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -350,7 +330,7 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, msr, ret = 0; + int bit, ret = 0; struct rapl_pmu *pmu; /* only look at RAPL events */ @@ -366,33 +346,12 @@ static int rapl_pmu_event_init(struct perf_event *event) event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; - /* - * check event is known (determines counter) - */ - switch (cfg) { - case INTEL_RAPL_PP0: - bit = RAPL_IDX_PP0_NRG_STAT; - msr = MSR_PP0_ENERGY_STATUS; - break; - case INTEL_RAPL_PKG: - bit = RAPL_IDX_PKG_NRG_STAT; - msr = MSR_PKG_ENERGY_STATUS; - break; - case INTEL_RAPL_RAM: - bit = RAPL_IDX_RAM_NRG_STAT; - msr = MSR_DRAM_ENERGY_STATUS; - break; - case INTEL_RAPL_PP1: - bit = RAPL_IDX_PP1_NRG_STAT; - msr = MSR_PP1_ENERGY_STATUS; - break; - case INTEL_RAPL_PSYS: - bit = RAPL_IDX_PSYS_NRG_STAT; - msr = MSR_PLATFORM_ENERGY_STATUS; - break; - default: + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) return -EINVAL; - } + + cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); + bit = cfg - 1; + /* check event supported */ if (!(rapl_cntr_mask & (1 << bit))) return -EINVAL; @@ -407,7 +366,7 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; - event->hw.event_base = msr; + event->hw.event_base = rapl_msrs[bit].msr; event->hw.config = cfg; event->hw.idx = bit; @@ -457,110 +416,111 @@ RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); -static struct attribute *rapl_events_srv_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), +/* + * There are no default events, but we need to create + * "events" group (with empty attrs) before updating + * it with detected events. 
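
This is the same trick as the cstate conversion earlier in the patch: pmu::attr_groups is fixed at registration time, so the driver registers a skeletal "events" directory and lists the per-domain groups in rapl_attr_update instead. Because every one of those groups is also named "events", the perf core merges them into that same sysfs directory after registration via the new attr_update mechanism, with perf_msr_probe() having already hidden any group whose MSR failed to probe. The net effect is equivalent to giving each group an .is_visible callback keyed on the probe mask; a sketch of that equivalent form (pkg_visible is an illustrative name, not the in-tree one):

	static umode_t pkg_visible(struct kobject *kobj,
				   struct attribute *attr, int idx)
	{
		/* expose energy-pkg only if the PKG MSR probed successfully */
		return (rapl_cntr_mask & BIT(PERF_RAPL_PKG)) ? attr->mode : 0;
	}

	static struct attribute_group rapl_events_pkg_group = {
		.name		= "events",
		.attrs		= rapl_events_pkg,
		.is_visible	= pkg_visible,
	};
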
+ */ +static struct attribute *attrs_empty[] = { + NULL, +}; - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, NULL, }; -static struct attribute *rapl_events_cln_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), +static const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, NULL, }; -static struct attribute *rapl_events_hsw_attr[] = { +static struct attribute *rapl_events_cores[] = { EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), NULL, }; -static struct attribute *rapl_events_skl_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - EVENT_PTR(rapl_psys), +static struct attribute_group rapl_events_cores_group = { + .name = "events", + .attrs = rapl_events_cores, +}; - EVENT_PTR(rapl_cores_unit), +static struct attribute *rapl_events_pkg[] = { + EVENT_PTR(rapl_pkg), EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - EVENT_PTR(rapl_psys_unit), - - EVENT_PTR(rapl_cores_scale), EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), - EVENT_PTR(rapl_psys_scale), NULL, }; -static struct attribute *rapl_events_knl_attr[] = { - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), +static struct attribute_group rapl_events_pkg_group = { + .name = "events", + .attrs = rapl_events_pkg, +}; - EVENT_PTR(rapl_pkg_unit), +static struct attribute *rapl_events_ram[] = { + EVENT_PTR(rapl_ram), EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_pkg_scale), EVENT_PTR(rapl_ram_scale), NULL, }; -static struct attribute_group rapl_pmu_events_group = { - .name = "events", - .attrs = NULL, /* patched at runtime */ +static struct attribute_group rapl_events_ram_group = { + .name = "events", + .attrs = rapl_events_ram, }; -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); -static struct attribute *rapl_formats_attr[] = { - &format_attr_event.attr, +static struct attribute *rapl_events_gpu[] = { + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_gpu_scale), NULL, }; -static struct attribute_group rapl_pmu_format_group = { - .name = "format", - .attrs = rapl_formats_attr, +static struct attribute_group rapl_events_gpu_group = { + .name = "events", + .attrs = rapl_events_gpu, }; -static const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, - &rapl_pmu_format_group, - &rapl_pmu_events_group, +static struct attribute *rapl_events_psys[] = { + EVENT_PTR(rapl_psys), + EVENT_PTR(rapl_psys_unit), + EVENT_PTR(rapl_psys_scale), NULL, }; +static struct attribute_group 
rapl_events_psys_group = { + .name = "events", + .attrs = rapl_events_psys, +}; + +static bool test_msr(int idx, void *data) +{ + return test_bit(idx, (unsigned long *) data); +} + +static struct perf_msr rapl_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, +}; + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -572,7 +532,7 @@ static int rapl_cpu_offline(unsigned int cpu) pmu->cpu = -1; /* Find a new cpu to collect rapl events */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); /* Migrate rapl events to the new target */ if (target < nr_cpu_ids) { @@ -599,14 +559,14 @@ static int rapl_cpu_online(unsigned int cpu) pmu->timer_interval = ms_to_ktime(rapl_timer_ms); rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; } /* * Check if there is an online cpu in the package which collects rapl * events already. */ - target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu)); + target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); if (target < nr_cpu_ids) return 0; @@ -633,7 +593,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ if (apply_quirk) - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; + rapl_hw_unit[PERF_RAPL_RAM] = 16; /* * Calculate the timer rate: @@ -669,23 +629,33 @@ static void cleanup_rapl_pmus(void) { int i; - for (i = 0; i < rapl_pmus->maxpkg; i++) + for (i = 0; i < rapl_pmus->maxdie; i++) kfree(rapl_pmus->pmus[i]); kfree(rapl_pmus); } +const struct attribute_group *rapl_attr_update[] = { + &rapl_events_cores_group, + &rapl_events_pkg_group, + &rapl_events_ram_group, + &rapl_events_gpu_group, + &rapl_events_psys_group, + NULL, +}; + static int __init init_rapl_pmus(void) { - int maxpkg = topology_max_packages(); + int maxdie = topology_max_packages() * topology_max_die_per_package(); size_t size; - size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *); + size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); rapl_pmus = kzalloc(size, GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; - rapl_pmus->maxpkg = maxpkg; + rapl_pmus->maxdie = maxdie; rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; rapl_pmus->pmu.event_init = rapl_pmu_event_init; rapl_pmus->pmu.add = rapl_pmu_event_add; @@ -701,105 +671,96 @@ static int __init init_rapl_pmus(void) #define X86_RAPL_MODEL_MATCH(model, init) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } -struct intel_rapl_init_fun { - bool apply_quirk; - int cntr_mask; - struct attribute **attrs; -}; - -static const struct intel_rapl_init_fun snb_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_CLN, - .attrs = rapl_events_cln_attr, +static struct rapl_model model_snb = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .apply_quirk = false, }; -static const struct intel_rapl_init_fun hsx_rapl_init
__initconst = { - .apply_quirk = true, - .cntr_mask = RAPL_IDX_SRV, - .attrs = rapl_events_srv_attr, +static struct rapl_model model_snbep = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = false, }; -static const struct intel_rapl_init_fun hsw_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_HSW, - .attrs = rapl_events_hsw_attr, +static struct rapl_model model_hsw = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .apply_quirk = false, }; -static const struct intel_rapl_init_fun snbep_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_SRV, - .attrs = rapl_events_srv_attr, +static struct rapl_model model_hsx = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = true, }; -static const struct intel_rapl_init_fun knl_rapl_init __initconst = { - .apply_quirk = true, - .cntr_mask = RAPL_IDX_KNL, - .attrs = rapl_events_knl_attr, +static struct rapl_model model_knl = { + .events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = true, }; -static const struct intel_rapl_init_fun skl_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_SKL_CLN, - .attrs = rapl_events_skl_attr, +static struct rapl_model model_skl = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .apply_quirk = false, }; -static const struct x86_cpu_id rapl_cpu_match[] __initconst = { - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, snb_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, skl_rapl_init), +static const struct x86_cpu_id rapl_model_match[] __initconst = { + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, model_snb), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, model_hsw), + 
X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, model_skl), {}, }; -MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match); +MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; - struct intel_rapl_init_fun *rapl_init; - bool apply_quirk; + struct rapl_model *rm; int ret; - id = x86_match_cpu(rapl_cpu_match); + id = x86_match_cpu(rapl_model_match); if (!id) return -ENODEV; - rapl_init = (struct intel_rapl_init_fun *)id->driver_data; - apply_quirk = rapl_init->apply_quirk; - rapl_cntr_mask = rapl_init->cntr_mask; - rapl_pmu_events_group.attrs = rapl_init->attrs; + rm = (struct rapl_model *) id->driver_data; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, + false, (void *) &rm->events); - ret = rapl_check_hw_unit(apply_quirk); + ret = rapl_check_hw_unit(rm->apply_quirk); if (ret) return ret; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9e3fbd47cb56..3694a5d0703d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -8,6 +8,7 @@ static struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; +struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; @@ -15,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_packages; +static int max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -28,7 +29,7 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -static int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; int phys_id = -1; @@ -101,13 +102,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - unsigned int pkgid = topology_logical_package_id(cpu); + unsigned int dieid = topology_logical_die_id(cpu); /* * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return pkgid < max_packages ? 
pmu->boxes[pkgid] : NULL; + return dieid < max_dies ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -119,6 +120,21 @@ u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *eve return count; } +void uncore_mmio_exit_box(struct intel_uncore_box *box) +{ + if (box->io_addr) + iounmap(box->io_addr); +} + +u64 uncore_mmio_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (!box->io_addr) + return 0; + + return readq(box->io_addr + event->hw.event_base); +} + /* * generic get constraint function for shared match/mask registers. */ @@ -312,7 +328,7 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; box->pci_phys_id = -1; - box->pkgid = -1; + box->dieid = -1; /* set default hrtimer timeout */ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; @@ -827,10 +843,10 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { - int pkg; + int die; - for (pkg = 0; pkg < max_packages; pkg++) - kfree(pmu->boxes[pkg]); + for (die = 0; die < max_dies; die++) + kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -867,7 +883,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_packages * sizeof(struct intel_uncore_box *); + size = max_dies * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -937,20 +953,21 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; struct intel_uncore_box *box; - int phys_id, pkg, ret; + int phys_id, die, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) + die = (topology_max_die_per_package() > 1) ? 
phys_id : + topology_phys_to_logical_pkg(phys_id); + if (die < 0) return -EINVAL; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[pkg].dev[idx] = pdev; + uncore_extra_pci_dev[die].dev[idx] = pdev; pci_set_drvdata(pdev, NULL); return 0; } @@ -989,7 +1006,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL)) + if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) return -EINVAL; box = uncore_alloc_box(type, NUMA_NO_NODE); @@ -1003,13 +1020,13 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id atomic_inc(&box->refcnt); box->pci_phys_id = phys_id; - box->pkgid = pkg; + box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); pci_set_drvdata(pdev, box); - pmu->boxes[pkg] = box; + pmu->boxes[die] = box; if (atomic_inc_return(&pmu->activeboxes) > 1) return 0; @@ -1017,7 +1034,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id ret = uncore_pmu_register(pmu); if (ret) { pci_set_drvdata(pdev, NULL); - pmu->boxes[pkg] = NULL; + pmu->boxes[die] = NULL; uncore_box_exit(box); kfree(box); } @@ -1028,16 +1045,17 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, pkg; + int i, phys_id, die; phys_id = uncore_pcibus_to_physid(pdev->bus); box = pci_get_drvdata(pdev); if (!box) { - pkg = topology_phys_to_logical_pkg(phys_id); + die = (topology_max_die_per_package() > 1) ? phys_id : + topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[pkg].dev[i] == pdev) { - uncore_extra_pci_dev[pkg].dev[i] = NULL; + if (uncore_extra_pci_dev[die].dev[i] == pdev) { + uncore_extra_pci_dev[die].dev[i] = NULL; break; } } @@ -1050,7 +1068,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->pkgid] = NULL; + pmu->boxes[box->dieid] = NULL; if (atomic_dec_return(&pmu->activeboxes) == 0) uncore_pmu_unregister(pmu); uncore_box_exit(box); @@ -1062,7 +1080,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_packages * sizeof(struct pci_extra_dev); + size = max_dies * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1109,11 +1127,11 @@ static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i, pkg; + int i, die; - pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu); + die = topology_logical_die_id(old_cpu < 0 ? 
new_cpu : old_cpu); for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; + box = pmu->boxes[die]; if (!box) continue; @@ -1141,18 +1159,33 @@ static void uncore_change_context(struct intel_uncore_type **uncores, uncore_change_type_ctx(*uncores, old_cpu, new_cpu); } -static int uncore_event_cpu_offline(unsigned int cpu) +static void uncore_box_unref(struct intel_uncore_type **types, int id) { - struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, pkg, target; + int i; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[id]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } +} + +static int uncore_event_cpu_offline(unsigned int cpu) +{ + int die, target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) goto unref; /* Find a new cpu to collect uncore events */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); /* Migrate uncore events to the new target */ if (target < nr_cpu_ids) @@ -1161,25 +1194,19 @@ static int uncore_event_cpu_offline(unsigned int cpu) target = -1; uncore_change_context(uncore_msr_uncores, cpu, target); + uncore_change_context(uncore_mmio_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); unref: /* Clear the references */ - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } + die = topology_logical_die_id(cpu); + uncore_box_unref(uncore_msr_uncores, die); + uncore_box_unref(uncore_mmio_uncores, die); return 0; } static int allocate_boxes(struct intel_uncore_type **types, - unsigned int pkg, unsigned int cpu) + unsigned int die, unsigned int cpu) { struct intel_uncore_box *box, *tmp; struct intel_uncore_type *type; @@ -1192,20 +1219,20 @@ static int allocate_boxes(struct intel_uncore_type **types, type = *types; pmu = type->pmus; for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) + if (pmu->boxes[die]) continue; box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) goto cleanup; box->pmu = pmu; - box->pkgid = pkg; + box->dieid = die; list_add(&box->active_list, &allocated); } } /* Install them in the pmus */ list_for_each_entry_safe(box, tmp, &allocated, active_list) { list_del_init(&box->active_list); - box->pmu->boxes[pkg] = box; + box->pmu->boxes[die] = box; } return 0; @@ -1217,15 +1244,15 @@ cleanup: return -ENOMEM; } -static int uncore_event_cpu_online(unsigned int cpu) +static int uncore_box_ref(struct intel_uncore_type **types, + int id, unsigned int cpu) { - struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, ret, pkg, target; + int i, ret; - pkg = topology_logical_package_id(cpu); - ret = allocate_boxes(types, pkg, cpu); + ret = allocate_boxes(types, id, cpu); if (ret) return ret; @@ -1233,23 +1260,38 @@ static int uncore_event_cpu_online(unsigned int cpu) type = *types; pmu = type->pmus; for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; + box = pmu->boxes[id]; if (box && 
atomic_inc_return(&box->refcnt) == 1) uncore_box_init(box); } } + return 0; +} + +static int uncore_event_cpu_online(unsigned int cpu) +{ + int die, target, msr_ret, mmio_ret; + + die = topology_logical_die_id(cpu); + msr_ret = uncore_box_ref(uncore_msr_uncores, die, cpu); + mmio_ret = uncore_box_ref(uncore_mmio_uncores, die, cpu); + if (msr_ret && mmio_ret) + return -ENOMEM; /* * Check if there is an online cpu in the package * which collects uncore events already. */ - target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu)); + target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu)); if (target < nr_cpu_ids) return 0; cpumask_set_cpu(cpu, &uncore_cpu_mask); - uncore_change_context(uncore_msr_uncores, -1, cpu); + if (!msr_ret) + uncore_change_context(uncore_msr_uncores, -1, cpu); + if (!mmio_ret) + uncore_change_context(uncore_mmio_uncores, -1, cpu); uncore_change_context(uncore_pci_uncores, -1, cpu); return 0; } @@ -1297,12 +1339,35 @@ err: return ret; } +static int __init uncore_mmio_init(void) +{ + struct intel_uncore_type **types = uncore_mmio_uncores; + int ret; + + ret = uncore_types_init(types, true); + if (ret) + goto err; + + for (; *types; types++) { + ret = type_pmu_register(*types); + if (ret) + goto err; + } + return 0; +err: + uncore_types_exit(uncore_mmio_uncores); + uncore_mmio_uncores = empty_uncore; + return ret; +} + + #define X86_UNCORE_MODEL_MATCH(model, init) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } struct intel_uncore_init_fun { void (*cpu_init)(void); int (*pci_init)(void); + void (*mmio_init)(void); }; static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { @@ -1373,6 +1438,12 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun snr_uncore_init __initconst = { + .cpu_init = snr_uncore_cpu_init, + .pci_init = snr_uncore_pci_init, + .mmio_init = snr_uncore_mmio_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1400,6 +1471,9 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_X, snr_uncore_init), {}, }; @@ -1409,7 +1483,7 @@ static int __init intel_uncore_init(void) { const struct x86_cpu_id *id; struct intel_uncore_init_fun *uncore_init; - int pret = 0, cret = 0, ret; + int pret = 0, cret = 0, mret = 0, ret; id = x86_match_cpu(intel_uncore_match); if (!id) @@ -1418,7 +1492,7 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_packages = topology_max_packages(); + max_dies = topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { @@ -1432,7 +1506,12 @@ static int __init intel_uncore_init(void) cret = uncore_cpu_init(); } - if (cret && pret) + if (uncore_init->mmio_init) { + uncore_init->mmio_init(); + mret = uncore_mmio_init(); + } + + if (cret && pret && mret) return -ENODEV; /* 
Install hotplug callbacks to setup the targets for each package */ @@ -1446,6 +1525,7 @@ static int __init intel_uncore_init(void) err: uncore_types_exit(uncore_msr_uncores); + uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); return ret; } @@ -1455,6 +1535,7 @@ static void __exit intel_uncore_exit(void) { cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); + uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 79eb2e21e4f0..f36f7bebbc1b 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -2,6 +2,7 @@ #include <linux/slab.h> #include <linux/pci.h> #include <asm/apicdef.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include <linux/perf_event.h> #include "../perf_event.h" @@ -56,7 +57,10 @@ struct intel_uncore_type { unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; - unsigned msr_offset; + union { + unsigned msr_offset; + unsigned mmio_offset; + }; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; @@ -108,7 +112,7 @@ struct intel_uncore_extra_reg { struct intel_uncore_box { int pci_phys_id; - int pkgid; /* Logical package ID */ + int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; int cpu; /* cpu to collect events */ @@ -125,7 +129,7 @@ struct intel_uncore_box { struct hrtimer hrtimer; struct list_head list; struct list_head active_list; - void *io_addr; + void __iomem *io_addr; struct intel_uncore_extra_reg shared_regs[0]; }; @@ -159,6 +163,7 @@ struct pci2phy_map { }; struct pci2phy_map *__find_pci2phy_map(int segment); +int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -190,6 +195,13 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline +unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl + + box->pmu->type->mmio_offset * box->pmu->pmu_idx; +} + static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) { return box->pmu->type->box_ctl; @@ -330,7 +342,7 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) static inline unsigned uncore_fixed_ctl(struct intel_uncore_box *box) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_fixed_ctl(box); else return uncore_msr_fixed_ctl(box); @@ -339,7 +351,7 @@ unsigned uncore_fixed_ctl(struct intel_uncore_box *box) static inline unsigned uncore_fixed_ctr(struct intel_uncore_box *box) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_fixed_ctr(box); else return uncore_msr_fixed_ctr(box); @@ -348,7 +360,7 @@ unsigned uncore_fixed_ctr(struct intel_uncore_box *box) static inline unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_event_ctl(box, idx); else return uncore_msr_event_ctl(box, idx); @@ -357,7 +369,7 @@ unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) static inline unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_perf_ctr(box, idx); else return uncore_msr_perf_ctr(box, idx); @@ -419,6 +431,16 @@ static inline bool is_freerunning_event(struct perf_event *event) (((cfg >> 8) & 0xff) >= 
UNCORE_FREERUNNING_UMASK_START); } +/* Check and reject invalid config */ +static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (is_freerunning_event(event)) + return 0; + + return -EINVAL; +} + static inline void uncore_disable_box(struct intel_uncore_box *box) { if (box->pmu->type->ops->disable_box) @@ -467,7 +489,7 @@ static inline void uncore_box_exit(struct intel_uncore_box *box) static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { - return (box->pkgid < 0); + return (box->dieid < 0); } static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) @@ -482,6 +504,9 @@ static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *ev struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); +void uncore_mmio_exit_box(struct intel_uncore_box *box); +u64 uncore_mmio_read_counter(struct intel_uncore_box *box, + struct perf_event *event); void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); void uncore_pmu_event_start(struct perf_event *event, int flags); @@ -497,6 +522,7 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; +extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; @@ -528,6 +554,9 @@ int knl_uncore_pci_init(void); void knl_uncore_cpu_init(void); int skx_uncore_pci_init(void); void skx_uncore_cpu_init(void); +int snr_uncore_pci_init(void); +void snr_uncore_cpu_init(void); +void snr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index f8431819b3e1..dbaa1b088a30 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -3,27 +3,29 @@ #include "uncore.h" /* Uncore IMC PCI IDs */ -#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 -#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 -#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 -#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 -#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 -#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904 -#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c -#define PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900 -#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 -#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f -#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f -#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c -#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 -#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 -#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f -#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f -#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc -#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0 -#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10 -#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4 +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 +#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904 +#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c +#define 
PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900 +#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 +#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f +#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f +#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c +#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 +#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 +#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f +#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f +#define PCI_DEVICE_ID_INTEL_KBL_HQ_IMC 0x5910 +#define PCI_DEVICE_ID_INTEL_KBL_WQ_IMC 0x5918 +#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc +#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0 +#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10 +#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4 #define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f #define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f #define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2 @@ -34,9 +36,15 @@ #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 +#define PCI_DEVICE_ID_INTEL_AML_YD_IMC 0x590c +#define PCI_DEVICE_ID_INTEL_AML_YQ_IMC 0x590d +#define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 +#define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 +#define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -420,11 +428,6 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } -static void snb_uncore_imc_exit_box(struct intel_uncore_box *box) -{ - iounmap(box->io_addr); -} - static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) {} @@ -437,13 +440,6 @@ static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct per static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) {} -static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); -} - /* * Keep the custom event_init() function compatible with old event * encoding for free running counters. 
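
The hunk below swaps the SNB IMC's private exit_box/read_counter callbacks for the generic uncore_mmio_exit_box()/uncore_mmio_read_counter() added earlier in this patch. One behavioural nuance: the deleted snb_uncore_imc_read_counter() dereferenced a 32-bit value, whereas the shared helper uses readq() — hence the new io-64-nonatomic-lo-hi include in uncore.h, which synthesizes readq from two 32-bit accesses on configurations without a native one. Free-running counters like these have no control register to program, which is why the new uncore_freerunning_hw_config() only validates the event encoding; the counter address is derived from the freerunning table. A sketch of that computation, assuming the initializer order used for snr_iio_freerunning below ({ counter_base, counter_offset, box_offset, num_counters, bits }) and that the table hangs off intel_uncore_type as in the existing SKX code:

	/* Sketch: address of free-running counter <idx> of a given <type>. */
	static unsigned int freerunning_counter_addr(struct intel_uncore_box *box,
						     unsigned int type,
						     unsigned int idx)
	{
		struct freerunning_counters *fr = &box->pmu->type->freerunning[type];

		return fr->counter_base +
		       fr->counter_offset * idx +		/* which counter */
		       fr->box_offset * box->pmu->pmu_idx;	/* which box */
	}

With the SNR IIO table below, BW_IN counter 2 of box 3 would land at 0x1f00 + 2 * 0x1 + 3 * 0x10.
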
@@ -570,13 +566,13 @@ static struct pmu snb_uncore_imc_pmu = { static struct intel_uncore_ops snb_uncore_imc_ops = { .init_box = snb_uncore_imc_init_box, - .exit_box = snb_uncore_imc_exit_box, + .exit_box = uncore_mmio_exit_box, .enable_box = snb_uncore_imc_enable_box, .disable_box = snb_uncore_imc_disable_box, .disable_event = snb_uncore_imc_disable_event, .enable_event = snb_uncore_imc_enable_event, .hw_config = snb_uncore_imc_hw_config, - .read_counter = snb_uncore_imc_read_counter, + .read_counter = uncore_mmio_read_counter, }; static struct intel_uncore_type snb_uncore_imc = { @@ -682,6 +678,14 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, @@ -737,6 +741,26 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -807,6 +831,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */ IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */ IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */ + IMC_DEV(KBL_HQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core H Quad Core */ + IMC_DEV(KBL_WQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S 4 cores Work Station */ IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */ IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */ IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */ @@ -821,6 +847,11 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ + IMC_DEV(AML_YD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core Y Mobile Dual Core */ + IMC_DEV(AML_YQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core Y Mobile Quad Core */ + IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ + IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ + IMC_DEV(WHL_UD_IMC, 
&skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b10e04387f38..b10a5ec79e48 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -324,12 +324,77 @@ #define SKX_M2M_PCI_PMON_CTR0 0x200 #define SKX_M2M_PCI_PMON_BOX_CTL 0x258 +/* SNR Ubox */ +#define SNR_U_MSR_PMON_CTR0 0x1f98 +#define SNR_U_MSR_PMON_CTL0 0x1f91 +#define SNR_U_MSR_PMON_UCLK_FIXED_CTL 0x1f93 +#define SNR_U_MSR_PMON_UCLK_FIXED_CTR 0x1f94 + +/* SNR CHA */ +#define SNR_CHA_RAW_EVENT_MASK_EXT 0x3ffffff +#define SNR_CHA_MSR_PMON_CTL0 0x1c01 +#define SNR_CHA_MSR_PMON_CTR0 0x1c08 +#define SNR_CHA_MSR_PMON_BOX_CTL 0x1c00 +#define SNR_C0_MSR_PMON_BOX_FILTER0 0x1c05 + + +/* SNR IIO */ +#define SNR_IIO_MSR_PMON_CTL0 0x1e08 +#define SNR_IIO_MSR_PMON_CTR0 0x1e01 +#define SNR_IIO_MSR_PMON_BOX_CTL 0x1e00 +#define SNR_IIO_MSR_OFFSET 0x10 +#define SNR_IIO_PMON_RAW_EVENT_MASK_EXT 0x7ffff + +/* SNR IRP */ +#define SNR_IRP0_MSR_PMON_CTL0 0x1ea8 +#define SNR_IRP0_MSR_PMON_CTR0 0x1ea1 +#define SNR_IRP0_MSR_PMON_BOX_CTL 0x1ea0 +#define SNR_IRP_MSR_OFFSET 0x10 + +/* SNR M2PCIE */ +#define SNR_M2PCIE_MSR_PMON_CTL0 0x1e58 +#define SNR_M2PCIE_MSR_PMON_CTR0 0x1e51 +#define SNR_M2PCIE_MSR_PMON_BOX_CTL 0x1e50 +#define SNR_M2PCIE_MSR_OFFSET 0x10 + +/* SNR PCU */ +#define SNR_PCU_MSR_PMON_CTL0 0x1ef1 +#define SNR_PCU_MSR_PMON_CTR0 0x1ef8 +#define SNR_PCU_MSR_PMON_BOX_CTL 0x1ef0 +#define SNR_PCU_MSR_PMON_BOX_FILTER 0x1efc + +/* SNR M2M */ +#define SNR_M2M_PCI_PMON_CTL0 0x468 +#define SNR_M2M_PCI_PMON_CTR0 0x440 +#define SNR_M2M_PCI_PMON_BOX_CTL 0x438 +#define SNR_M2M_PCI_PMON_UMASK_EXT 0xff + +/* SNR PCIE3 */ +#define SNR_PCIE3_PCI_PMON_CTL0 0x508 +#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8 +#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e4 + +/* SNR IMC */ +#define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54 +#define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38 +#define SNR_IMC_MMIO_PMON_CTL0 0x40 +#define SNR_IMC_MMIO_PMON_CTR0 0x8 +#define SNR_IMC_MMIO_PMON_BOX_CTL 0x22800 +#define SNR_IMC_MMIO_OFFSET 0x4000 +#define SNR_IMC_MMIO_SIZE 0x4000 +#define SNR_IMC_MMIO_BASE_OFFSET 0xd0 +#define SNR_IMC_MMIO_BASE_MASK 0x1FFFFFFF +#define SNR_IMC_MMIO_MEM0_OFFSET 0xd8 +#define SNR_IMC_MMIO_MEM0_MASK 0x7FF + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); @@ -343,11 +408,14 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43"); +DEFINE_UNCORE_FORMAT_ATTR(ch_mask2, ch_mask, "config:36-47"); DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46"); +DEFINE_UNCORE_FORMAT_ATTR(fc_mask2, 
fc_mask, "config:48-50"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid5, filter_tid, "config1:0-9"); DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); @@ -1058,8 +1126,8 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve if (reg1->idx != EXTRA_REG_NONE) { int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - int pkg = box->pkgid; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx]; + int die = box->dieid; + struct pci_dev *filter_pdev = uncore_extra_pci_dev[die].dev[idx]; if (filter_pdev) { pci_write_config_dword(filter_pdev, reg1->reg, @@ -3585,6 +3653,7 @@ static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = { static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = { .read_counter = uncore_msr_read_counter, + .hw_config = uncore_freerunning_hw_config, }; static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = { @@ -3967,3 +4036,535 @@ int skx_uncore_pci_init(void) } /* end of SKX uncore support */ + +/* SNR uncore support */ + +static struct intel_uncore_type snr_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = SNR_U_MSR_PMON_CTR0, + .event_ctl = SNR_U_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNR_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNR_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct attribute *snr_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext2.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid5.attr, + NULL, +}; +static const struct attribute_group snr_uncore_chabox_format_group = { + .name = "format", + .attrs = snr_uncore_cha_formats_attr, +}; + +static int snr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + reg1->reg = SNR_C0_MSR_PMON_BOX_FILTER0 + + box->pmu->type->msr_offset * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID; + reg1->idx = 0; + + return 0; +} + +static void snr_cha_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snr_uncore_chabox_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snr_cha_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = snr_cha_hw_config, +}; + +static struct intel_uncore_type snr_uncore_chabox = { + .name = "cha", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .event_ctl = SNR_CHA_MSR_PMON_CTL0, + .perf_ctr = 
SNR_CHA_MSR_PMON_CTR0, + .box_ctl = SNR_CHA_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT, + .ops = &snr_uncore_chabox_ops, + .format_group = &snr_uncore_chabox_format_group, +}; + +static struct attribute *snr_uncore_iio_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh9.attr, + &format_attr_ch_mask2.attr, + &format_attr_fc_mask2.attr, + NULL, +}; + +static const struct attribute_group snr_uncore_iio_format_group = { + .name = "format", + .attrs = snr_uncore_iio_formats_attr, +}; + +static struct intel_uncore_type snr_uncore_iio = { + .name = "iio", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SNR_IIO_MSR_PMON_CTL0, + .perf_ctr = SNR_IIO_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL, + .msr_offset = SNR_IIO_MSR_OFFSET, + .ops = &ivbep_uncore_msr_ops, + .format_group = &snr_uncore_iio_format_group, +}; + +static struct intel_uncore_type snr_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SNR_IRP0_MSR_PMON_CTL0, + .perf_ctr = SNR_IRP0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_IRP0_MSR_PMON_BOX_CTL, + .msr_offset = SNR_IRP_MSR_OFFSET, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct intel_uncore_type snr_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SNR_M2PCIE_MSR_PMON_CTL0, + .perf_ctr = SNR_M2PCIE_MSR_PMON_CTR0, + .box_ctl = SNR_M2PCIE_MSR_PMON_BOX_CTL, + .msr_offset = SNR_M2PCIE_MSR_OFFSET, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static int snr_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = SNR_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << reg1->idx); + } + return 0; +} + +static struct intel_uncore_ops snr_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snr_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type snr_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCU_MSR_PMON_CTR0, + .event_ctl = SNR_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snr_uncore_pcu_ops, + .format_group = &skx_uncore_pcu_format_group, +}; + +enum perf_uncore_snr_iio_freerunning_type_id { + SNR_IIO_MSR_IOCLK, + SNR_IIO_MSR_BW_IN, + + SNR_IIO_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters snr_iio_freerunning[] = { + [SNR_IIO_MSR_IOCLK] = { 0x1eac, 0x1, 0x10, 1, 48 }, + [SNR_IIO_MSR_BW_IN] = { 0x1f00, 0x1, 0x10, 8, 48 }, +}; + +static struct uncore_event_desc snr_uncore_iio_freerunning_events[] = { + /* Free-Running IIO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO 
BANDWIDTH IN Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type snr_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 9, + .num_boxes = 5, + .num_freerunning_types = SNR_IIO_FREERUNNING_TYPE_MAX, + .freerunning = snr_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = snr_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +static struct intel_uncore_type *snr_msr_uncores[] = { + &snr_uncore_ubox, + &snr_uncore_chabox, + &snr_uncore_iio, + &snr_uncore_irp, + &snr_uncore_m2pcie, + &snr_uncore_pcu, + &snr_uncore_iio_free_running, + NULL, +}; + +void snr_uncore_cpu_init(void) +{ + uncore_msr_uncores = snr_msr_uncores; +} + +static void snr_m2m_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, box_ctl, IVBEP_PMON_BOX_CTL_INT); +} + +static struct intel_uncore_ops snr_m2m_uncore_pci_ops = { + .init_box = snr_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct attribute *snr_m2m_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext3.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group snr_m2m_uncore_format_group = { + .name = "format", + .attrs = snr_m2m_uncore_formats_attr, +}; + +static struct intel_uncore_type snr_uncore_m2m = { + .name = "m2m", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_M2M_PCI_PMON_CTR0, + .event_ctl = SNR_M2M_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT, + .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL, + .ops = &snr_m2m_uncore_pci_ops, + 
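/*
 * Editor's note on the .scale strings above: perf multiplies the raw count
 * by .scale before applying .unit, so the constant encodes the counter's
 * granularity. The free-running bandwidth counters tick once per 4 bytes
 * transferred:
 *
 *	4 / 2^20 = 3.814697265625e-6 MiB
 *
 * The IMC cas_count_* events further down count 64-byte cache lines:
 *
 *	64 / 2^20 = 6.103515625e-5 MiB
 */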
.format_group = &snr_m2m_uncore_format_group, +}; + +static struct intel_uncore_type snr_uncore_pcie3 = { + .name = "pcie3", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, + .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &ivbep_uncore_format_group, +}; + +enum { + SNR_PCI_UNCORE_M2M, + SNR_PCI_UNCORE_PCIE3, +}; + +static struct intel_uncore_type *snr_pci_uncores[] = { + [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m, + [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, + NULL, +}; + +static const struct pci_device_id snr_uncore_pci_ids[] = { + { /* M2M */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, SNR_PCI_UNCORE_M2M, 0), + }, + { /* PCIe3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snr_uncore_pci_driver = { + .name = "snr_uncore", + .id_table = snr_uncore_pci_ids, +}; + +int snr_uncore_pci_init(void) +{ + /* SNR UBOX DID */ + int ret = snbep_pci2phy_map_init(0x3460, SKX_CPUNODEID, + SKX_GIDNIDMAP, true); + + if (ret) + return ret; + + uncore_pci_uncores = snr_pci_uncores; + uncore_pci_driver = &snr_uncore_pci_driver; + return 0; +} + +static struct pci_dev *snr_uncore_get_mc_dev(int id) +{ + struct pci_dev *mc_dev = NULL; + int phys_id, pkg; + + while (1) { + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); + if (!mc_dev) + break; + phys_id = uncore_pcibus_to_physid(mc_dev->bus); + if (phys_id < 0) + continue; + pkg = topology_phys_to_logical_pkg(phys_id); + if (pkg < 0) + continue; + else if (pkg == id) + break; + } + return mc_dev; +} + +static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + unsigned int box_ctl = uncore_mmio_box_ctl(box); + resource_size_t addr; + u32 pci_dword; + + if (!pdev) + return; + + pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); + addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; + + pci_read_config_dword(pdev, SNR_IMC_MMIO_MEM0_OFFSET, &pci_dword); + addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12; + + addr += box_ctl; + + box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); + if (!box->io_addr) + return; + + writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); +} + +static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + u32 config; + + if (!box->io_addr) + return; + + config = readl(box->io_addr); + config |= SNBEP_PMON_BOX_CTL_FRZ; + writel(config, box->io_addr); +} + +static void snr_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + u32 config; + + if (!box->io_addr) + return; + + config = readl(box->io_addr); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + writel(config, box->io_addr); +} + +static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config | SNBEP_PMON_CTL_EN, + box->io_addr + hwc->config_base); +} + +static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops snr_uncore_mmio_ops = { + .init_box = snr_uncore_mmio_init_box, + 
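/*
 * Editor's note: the address arithmetic in snr_uncore_mmio_init_box()
 * above, spelled out. The physical base of the IMC PMON region is split
 * across two config-space registers of the memory-controller PCI device
 * (DID 0x3451):
 *
 *	addr  = (base_reg & SNR_IMC_MMIO_BASE_MASK) << 23;	bits [51:23]
 *	addr |= (mem0_reg & SNR_IMC_MMIO_MEM0_MASK) << 12;	bits [22:12]
 *	addr += box_ctl;	0x22800, plus a 0x4000 stride per box
 *
 * ioremap() then maps SNR_IMC_MMIO_SIZE (16 KiB) so the counters can be
 * accessed with readl()/writel() instead of PCI config cycles.
 */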
.exit_box = uncore_mmio_exit_box, + .disable_box = snr_uncore_mmio_disable_box, + .enable_box = snr_uncore_mmio_enable_box, + .disable_event = snr_uncore_mmio_disable_event, + .enable_event = snr_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static struct uncore_event_desc snr_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type snr_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .event_descs = snr_uncore_imc_events, + .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, + .event_ctl = SNR_IMC_MMIO_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, + .mmio_offset = SNR_IMC_MMIO_OFFSET, + .ops = &snr_uncore_mmio_ops, + .format_group = &skx_uncore_format_group, +}; + +enum perf_uncore_snr_imc_freerunning_type_id { + SNR_IMC_DCLK, + SNR_IMC_DDR, + + SNR_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters snr_imc_freerunning[] = { + [SNR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 }, + [SNR_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 }, +}; + +static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { + INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), + + INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), +}; + +static struct intel_uncore_ops snr_uncore_imc_freerunning_ops = { + .init_box = snr_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type snr_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 1, + .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .freerunning = snr_imc_freerunning, + .ops = &snr_uncore_imc_freerunning_ops, + .event_descs = snr_uncore_imc_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +static struct intel_uncore_type *snr_mmio_uncores[] = { + &snr_uncore_imc, + &snr_uncore_imc_free_running, + NULL, +}; + +void snr_uncore_mmio_init(void) +{ + uncore_mmio_uncores = snr_mmio_uncores; +} + +/* end of SNR uncore support */ diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index f3f4c2263501..9431447541e9 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> +#include <linux/sysfs.h> #include <linux/nospec.h> #include <asm/intel-family.h> +#include "probe.h" enum perf_msr_id { PERF_MSR_TSC = 0, @@ -12,32 +14,30 @@ enum perf_msr_id { PERF_MSR_PTSC = 5, PERF_MSR_IRPERF = 6, PERF_MSR_THERM = 7, - PERF_MSR_THERM_SNAP = 8, - PERF_MSR_THERM_UNIT = 9, PERF_MSR_EVENT_MAX, }; -static bool 
test_aperfmperf(int idx) +static bool test_aperfmperf(int idx, void *data) { return boot_cpu_has(X86_FEATURE_APERFMPERF); } -static bool test_ptsc(int idx) +static bool test_ptsc(int idx, void *data) { return boot_cpu_has(X86_FEATURE_PTSC); } -static bool test_irperf(int idx) +static bool test_irperf(int idx, void *data) { return boot_cpu_has(X86_FEATURE_IRPERF); } -static bool test_therm_status(int idx) +static bool test_therm_status(int idx, void *data) { return boot_cpu_has(X86_FEATURE_DTHERM); } -static bool test_intel(int idx) +static bool test_intel(int idx, void *data) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) @@ -98,37 +98,51 @@ static bool test_intel(int idx) return false; } -struct perf_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); +PMU_EVENT_ATTR_STRING(tsc, attr_tsc, "event=0x00" ); +PMU_EVENT_ATTR_STRING(aperf, attr_aperf, "event=0x01" ); +PMU_EVENT_ATTR_STRING(mperf, attr_mperf, "event=0x02" ); +PMU_EVENT_ATTR_STRING(pperf, attr_pperf, "event=0x03" ); +PMU_EVENT_ATTR_STRING(smi, attr_smi, "event=0x04" ); +PMU_EVENT_ATTR_STRING(ptsc, attr_ptsc, "event=0x05" ); +PMU_EVENT_ATTR_STRING(irperf, attr_irperf, "event=0x06" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin, attr_therm, "event=0x07" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, attr_therm_snap, "1" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, attr_therm_unit, "C" ); + +static unsigned long msr_mask; + +PMU_EVENT_GROUP(events, aperf); +PMU_EVENT_GROUP(events, mperf); +PMU_EVENT_GROUP(events, pperf); +PMU_EVENT_GROUP(events, smi); +PMU_EVENT_GROUP(events, ptsc); +PMU_EVENT_GROUP(events, irperf); + +static struct attribute *attrs_therm[] = { + &attr_therm.attr.attr, + &attr_therm_snap.attr.attr, + &attr_therm_unit.attr.attr, + NULL, }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00" ); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01" ); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02" ); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03" ); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04" ); -PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05" ); -PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06" ); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07" ); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1" ); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C" ); +static struct attribute_group group_therm = { + .name = "events", + .attrs = attrs_therm, +}; static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, - [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, - [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, - [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, }, - [PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, }, - [PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, }, + [PERF_MSR_TSC] = { .no_check = true, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &group_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &group_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &group_pperf, test_intel, 
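/*
 * Editor's note: PMU_EVENT_GROUP() (added in probe.h below) gives every
 * event its own single-attribute group so that perf_msr_probe() can hide
 * unsupported events individually through ->is_visible. For instance,
 * PMU_EVENT_GROUP(events, aperf) expands to:
 */
static struct attribute *attrs_aperf[] = {
	&attr_aperf.attr.attr,
	NULL,
};
static struct attribute_group group_aperf = {
	.name	= "events",	/* appears under the PMU's events/ directory */
	.attrs	= attrs_aperf,
};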
}, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &group_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &group_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &group_irperf, test_irperf, }, + [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &group_therm, test_therm_status, }, }; -static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { +static struct attribute *events_attrs[] = { + &attr_tsc.attr.attr, NULL, }; @@ -153,6 +167,17 @@ static const struct attribute_group *attr_groups[] = { NULL, }; +const struct attribute_group *attr_update[] = { + &group_aperf, + &group_mperf, + &group_pperf, + &group_smi, + &group_ptsc, + &group_irperf, + &group_therm, + NULL, +}; + static int msr_event_init(struct perf_event *event) { u64 cfg = event->attr.config; @@ -169,7 +194,7 @@ static int msr_event_init(struct perf_event *event) cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX); - if (!msr[cfg].attr) + if (!(msr_mask & (1 << cfg))) return -EINVAL; event->hw.idx = -1; @@ -252,32 +277,17 @@ static struct pmu pmu_msr = { .stop = msr_event_stop, .read = msr_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = attr_update, }; static int __init msr_init(void) { - int i, j = 0; - if (!boot_cpu_has(X86_FEATURE_TSC)) { pr_cont("no MSR PMU driver.\n"); return 0; } - /* Probe the MSRs. */ - for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { - u64 val; - - /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining MSRs in the sysfs attrs. */ - for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; + msr_mask = perf_msr_probe(msr, PERF_MSR_EVENT_MAX, true, NULL); perf_pmu_register(&pmu_msr, "msr", -1); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4e346856ee19..8751008fc170 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -613,14 +613,11 @@ struct x86_pmu { int attr_rdpmc_broken; int attr_rdpmc; struct attribute **format_attrs; - struct attribute **event_attrs; - struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); - struct attribute **cpu_events; + const struct attribute_group **attr_update; unsigned long attr_freeze_on_smi; - struct attribute **attrs; /* * CPU Hotplug hooks @@ -886,8 +883,6 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); -struct attribute **merge_attr(struct attribute **a, struct attribute **b); - ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c new file mode 100644 index 000000000000..c2ede2f3b277 --- /dev/null +++ b/arch/x86/events/probe.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/export.h> +#include <linux/types.h> +#include <linux/bits.h> +#include "probe.h" + +static umode_t +not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + +unsigned long +perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) +{ + unsigned long avail = 0; + unsigned int bit; + u64 val; + + if (cnt >= BITS_PER_LONG) + return 0; + + for (bit = 0; bit < cnt; 
bit++) { + if (!msr[bit].no_check) { + struct attribute_group *grp = msr[bit].grp; + + grp->is_visible = not_visible; + + if (msr[bit].test && !msr[bit].test(bit, data)) + continue; + /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ + if (rdmsrl_safe(msr[bit].msr, &val)) + continue; + /* Disable zero counters if requested. */ + if (!zero && !val) + continue; + + grp->is_visible = NULL; + } + avail |= BIT(bit); + } + + return avail; +} +EXPORT_SYMBOL_GPL(perf_msr_probe); diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h new file mode 100644 index 000000000000..4c8e0afc5fb5 --- /dev/null +++ b/arch/x86/events/probe.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_X86_EVENTS_PROBE_H__ +#define __ARCH_X86_EVENTS_PROBE_H__ +#include <linux/sysfs.h> + +struct perf_msr { + u64 msr; + struct attribute_group *grp; + bool (*test)(int idx, void *data); + bool no_check; +}; + +unsigned long +perf_msr_probe(struct perf_msr *msr, int cnt, bool no_zero, void *data); + +#define __PMU_EVENT_GROUP(_name) \ +static struct attribute *attrs_##_name[] = { \ + &attr_##_name.attr.attr, \ + NULL, \ +} + +#define PMU_EVENT_GROUP(_grp, _name) \ +__PMU_EVENT_GROUP(_name); \ +static struct attribute_group group_##_name = { \ + .name = #_grp, \ + .attrs = attrs_##_name, \ +} + +#endif /* __ARCH_X86_EVENTS_PROBE_H__ */ diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1608050e9df9..0e033ef11a9f 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -17,64 +17,13 @@ #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/clockchips.h> #include <linux/hyperv.h> #include <linux/slab.h> #include <linux/cpuhotplug.h> - -#ifdef CONFIG_HYPERV_TSCPAGE - -static struct ms_hyperv_tsc_page *tsc_pg; - -struct ms_hyperv_tsc_page *hv_get_tsc_page(void) -{ - return tsc_pg; -} -EXPORT_SYMBOL_GPL(hv_get_tsc_page); - -static u64 read_hv_clock_tsc(struct clocksource *arg) -{ - u64 current_tick = hv_read_tsc_page(tsc_pg); - - if (current_tick == U64_MAX) - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - - return current_tick; -} - -static struct clocksource hyperv_cs_tsc = { - .name = "hyperv_clocksource_tsc_page", - .rating = 400, - .read = read_hv_clock_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; -#endif - -static u64 read_hv_clock_msr(struct clocksource *arg) -{ - u64 current_tick; - /* - * Read the partition counter to get the current tick count. This count - * is set to 0 when the partition is created and is incremented in - * 100 nanosecond units. - */ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - return current_tick; -} - -static struct clocksource hyperv_cs_msr = { - .name = "hyperv_clocksource_msr", - .rating = 400, - .read = read_hv_clock_msr, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; +#include <clocksource/hyperv_timer.h> void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); -struct clocksource *hyperv_cs; -EXPORT_SYMBOL_GPL(hyperv_cs); u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); @@ -343,42 +292,8 @@ void __init hyperv_init(void) x86_init.pci.arch_init = hv_pci_init; - /* - * Register Hyper-V specific clocksource. 
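/*
 * Editor's note: a condensed sketch of the probe flow introduced above.
 * The pieces are from msr.c and probe.c in this diff; only the assembly
 * into one function is mine. perf_msr_probe() returns a bitmask of usable
 * entries and points ->is_visible of every unusable entry's attribute
 * group at not_visible():
 */
static int __init example_msr_init(void)
{
	/*
	 * zero == true: keep counters that currently read 0. Entries whose
	 * ->test() fails or whose rdmsrl_safe() faults are masked out of
	 * msr_mask and their sysfs event group hidden.
	 */
	msr_mask = perf_msr_probe(msr, PERF_MSR_EVENT_MAX, true, NULL);

	perf_pmu_register(&pmu_msr, "msr", -1);
	return 0;
}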
- */ -#ifdef CONFIG_HYPERV_TSCPAGE - if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) { - union hv_x64_msr_hypercall_contents tsc_msr; - - tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); - if (!tsc_pg) - goto register_msr_cs; - - hyperv_cs = &hyperv_cs_tsc; - - rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); - - tsc_msr.enable = 1; - tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); - - wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); - - hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK; - - clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); - return; - } -register_msr_cs: -#endif - /* - * For 32 bit guests just use the MSR based mechanism for reading - * the partition counter. - */ - - hyperv_cs = &hyperv_cs_msr; - if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) - clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); - + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); return; remove_cpuhp_state: diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 629d1ee05599..1cee10091b9f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -358,7 +358,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); /* Create the ucontext. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (static_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index a43212036257..64a6c952091e 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls_val, int __user *, child_tidptr) { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, - tls_val); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls_val, + }; + + return _do_fork(&args); } diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h new file mode 100644 index 000000000000..4adb13f08af7 --- /dev/null +++ b/arch/x86/include/asm/acrn.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ACRN_H +#define _ASM_X86_ACRN_H + +extern void acrn_hv_callback_vector(void); +#ifdef CONFIG_TRACING +#define trace_acrn_hv_callback_vector acrn_hv_callback_vector +#endif + +extern void acrn_hv_vector_handler(struct pt_regs *regs); +#endif /* _ASM_X86_ACRN_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1340fa53b575..050e5f9ebf81 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -53,7 +53,7 @@ extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; -extern unsigned int lapic_timer_frequency; +extern unsigned int lapic_timer_period; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { @@ -155,7 +155,6 @@ static inline int apic_force_enable(unsigned long addr) extern int apic_force_enable(unsigned long addr); #endif -extern void apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); /* @@ -175,6 +174,7 @@ extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); extern void lapic_online(void); extern void lapic_offline(void); 
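/*
 * Editor's note on the x86_clone conversion above: the legacy clone ABI
 * packs the child's exit signal into the low byte of the flags word, and
 * struct kernel_clone_args splits it back out. E.g. a userspace value of
 * (CLONE_VM | SIGCHLD) becomes:
 *
 *	.flags       = CLONE_VM,	(clone_flags & ~CSIGNAL)
 *	.exit_signal = SIGCHLD,		(clone_flags & CSIGNAL)
 */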
+extern bool apic_needs_pit(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } @@ -188,6 +188,7 @@ static inline void init_bsp_APIC(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } +static inline bool apic_needs_pit(void) { return true; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ea3d95275b43..115127c7ad28 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -54,7 +54,7 @@ static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -68,7 +68,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -95,7 +95,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc @@ -108,7 +108,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v) static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6a5b0ec460da..52cfaecb13f9 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -9,7 +9,7 @@ /* An 64bit atomic type */ typedef struct { - u64 __aligned(8) counter; + s64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } @@ -71,8 +71,7 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, - long long n) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } @@ -85,9 +84,9 @@ static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, * Atomically xchgs the value of @v to @n and returns * the old value. */ -static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) +static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { - long long o; + s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; alternative_atomic64(xchg, "=&A" (o), @@ -103,7 +102,7 @@ static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) * * Atomically sets the value of @v to @n. */ -static inline void arch_atomic64_set(atomic64_t *v, long long i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -118,9 +117,9 @@ static inline void arch_atomic64_set(atomic64_t *v, long long i) * * Atomically reads the value of @v and returns it. 
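/*
 * Editor's note on the "memory" clobbers added to atomic.h above: without
 * one, the compiler may treat the asm as touching only v->counter and move
 * unrelated plain loads/stores across it:
 *
 *	x = 1;
 *	atomic_inc(&v);		no clobber: the store to x may sink below
 *	r = y;			...and this load may hoist above
 *
 * With the clobber, every LOCK-prefixed op is also a compiler barrier,
 * which is what allows the barrier.h change further down to make
 * smp_mb__{before,after}_atomic() empty on x86.
 */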
*/ -static inline long long arch_atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { - long long r; + s64 r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; } @@ -132,7 +131,7 @@ static inline long long arch_atomic64_read(const atomic64_t *v) * * Atomically adds @i to @v and returns @i + *@v */ -static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) +static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -143,7 +142,7 @@ static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) /* * Other variants with different arithmetic operators: */ -static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) +static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -151,18 +150,18 @@ static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) return i; } -static inline long long arch_atomic64_inc_return(atomic64_t *v) +static inline s64 arch_atomic64_inc_return(atomic64_t *v) { - long long a; + s64 a; alternative_atomic64(inc_return, "=&A" (a), "S" (v) : "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return -static inline long long arch_atomic64_dec_return(atomic64_t *v) +static inline s64 arch_atomic64_dec_return(atomic64_t *v) { - long long a; + s64 a; alternative_atomic64(dec_return, "=&A" (a), "S" (v) : "memory", "ecx"); return a; @@ -176,7 +175,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v) * * Atomically adds @i to @v. */ -static inline long long arch_atomic64_add(long long i, atomic64_t *v) +static inline s64 arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -191,7 +190,7 @@ static inline long long arch_atomic64_add(long long i, atomic64_t *v) * * Atomically subtracts @i from @v. */ -static inline long long arch_atomic64_sub(long long i, atomic64_t *v) +static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -234,8 +233,7 @@ static inline void arch_atomic64_dec(atomic64_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. 
*/ -static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, - long long u) +static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -254,9 +252,9 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) +static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { - long long r; + s64 r; alternative_atomic64(dec_if_positive, "=&A" (r), "S" (v) : "ecx", "memory"); return r; @@ -266,17 +264,17 @@ static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void arch_atomic64_and(long long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; } -static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; @@ -284,17 +282,17 @@ static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) return old; } -static inline void arch_atomic64_or(long long i, atomic64_t *v) +static inline void arch_atomic64_or(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; } -static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; @@ -302,17 +300,17 @@ static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) return old; } -static inline void arch_atomic64_xor(long long i, atomic64_t *v) +static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; } -static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; @@ -320,9 +318,9 @@ static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) return old; } -static inline long long arch_atomic64_fetch_add(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) c = old; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index dadc20adba21..95c6ceac66b9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -17,7 +17,7 @@ * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline long arch_atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { return READ_ONCE((v)->counter); } @@ -29,7 +29,7 @@ static inline long arch_atomic64_read(const atomic64_t *v) * * Atomically sets the value of @v to @i. 
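/*
 * Editor's note on the "c = 0" seed in the 32-bit and/or/xor loops above:
 * the loop never issues a separate read of v (an atomic 64-bit load on
 * i386 typically needs cmpxchg8b itself). Instead the first, almost always
 * failing, cmpxchg doubles as the atomic read:
 *
 *	old = cmpxchg(v, 0, 0 & i);	fails, but returns *v atomically
 *	c = old;
 *	old = cmpxchg(v, c, c & i);	succeeds if v is still unchanged
 */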
*/ -static inline void arch_atomic64_set(atomic64_t *v, long i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { WRITE_ONCE(v->counter, i); } @@ -41,11 +41,11 @@ static inline void arch_atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static __always_inline void arch_atomic64_add(long i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -55,11 +55,11 @@ static __always_inline void arch_atomic64_add(long i, atomic64_t *v) * * Atomically subtracts @i from @v. */ -static inline void arch_atomic64_sub(long i, atomic64_t *v) +static inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } @@ -87,7 +87,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } #define arch_atomic64_inc arch_atomic64_inc @@ -101,7 +101,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } #define arch_atomic64_dec arch_atomic64_dec @@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. 
*/ -static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) +static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } @@ -155,43 +155,43 @@ static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } -static inline long arch_atomic64_sub_return(long i, atomic64_t *v) +static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { return arch_atomic64_add_return(-i, v); } -static inline long arch_atomic64_fetch_add(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } -static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { return xadd(&v->counter, -i); } -static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg -static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { return try_cmpxchg(&v->counter, old, new); } -static inline long arch_atomic64_xchg(atomic64_t *v, long new) +static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } -static inline void arch_atomic64_and(long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "andq %1,%0" : "+m" (v->counter) @@ -199,7 +199,7 @@ static inline void arch_atomic64_and(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -208,7 +208,7 @@ static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) return val; } -static inline void arch_atomic64_or(long i, atomic64_t *v) +static inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "orq %1,%0" : "+m" (v->counter) @@ -216,7 +216,7 @@ static inline void arch_atomic64_or(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -225,7 +225,7 @@ static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) return val; } -static inline void arch_atomic64_xor(long i, atomic64_t *v) +static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "xorq %1,%0" : "+m" (v->counter) @@ -233,7 +233,7 @@ static inline void arch_atomic64_xor(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_xor(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 14de0432d288..84f848c2541a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -80,8 +80,8 @@ do { \ }) /* Atomic operations are already serializing on x86 */ -#define __smp_mb__before_atomic() barrier() 
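/*
 * Editor's note: on x86 every LOCK-prefixed RMW already acts as a full
 * memory barrier, so smp_mb__{before,after}_atomic() never needed a fence
 * here, only compiler-level ordering. That ordering is now provided by the
 * "memory" clobbers on the atomics themselves (see atomic.h above), so the
 * explicit barrier() can degenerate to a no-op:
 *
 *	smp_mb__before_atomic();	now expands to do { } while (0)
 *	atomic_inc(&v);			LOCK incl; asm carries the clobber
 */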
-#define __smp_mb__after_atomic() barrier() +#define __smp_mb__before_atomic() do { } while (0) +#define __smp_mb__after_atomic() do { } while (0) #include <asm-generic/barrier.h> diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 8e790ec219a5..ba15d53c1ca7 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -49,23 +49,8 @@ #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) -/** - * set_bit - Atomically set a bit in memory - * @nr: the bit to set - * @addr: the address to start counting from - * - * This function is atomic and may not be reordered. See __set_bit() - * if you do not require the atomic guarantees. - * - * Note: there are no guarantees that this function will not be reordered - * on non x86 architectures, so if you are writing portable code, - * make sure not to rely on its reordering guarantees. - * - * Note that @nr may be almost arbitrarily large; this function is not - * restricted to acting on a single-word quantity. - */ static __always_inline void -set_bit(long nr, volatile unsigned long *addr) +arch_set_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" @@ -78,32 +63,14 @@ set_bit(long nr, volatile unsigned long *addr) } } -/** - * __set_bit - Set a bit in memory - * @nr: the bit to set - * @addr: the address to start counting from - * - * Unlike set_bit(), this function is non-atomic and may be reordered. - * If it's called on the same region of memory simultaneously, the effect - * may be that only one operation succeeds. - */ -static __always_inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch___set_bit(long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -/** - * clear_bit - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * clear_bit() is atomic and may not be reordered. However, it does - * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() - * in order to ensure changes are visible on other processors. - */ static __always_inline void -clear_bit(long nr, volatile unsigned long *addr) +arch_clear_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" @@ -115,26 +82,21 @@ clear_bit(long nr, volatile unsigned long *addr) } } -/* - * clear_bit_unlock - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * clear_bit() is atomic and implies release semantics before the memory - * operation. It can be used for an unlock. 
- */ -static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void +arch_clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); - clear_bit(nr, addr); + arch_clear_bit(nr, addr); } -static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch___clear_bit(long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; asm volatile(LOCK_PREFIX "andb %2,%1" @@ -143,48 +105,23 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile : "ir" ((char) ~(1 << nr)) : "memory"); return negative; } +#define arch_clear_bit_unlock_is_negative_byte \ + arch_clear_bit_unlock_is_negative_byte -// Let everybody know we have it -#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte - -/* - * __clear_bit_unlock - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * __clear_bit() is non-atomic and implies release semantics before the memory - * operation. It can be used for an unlock if no other CPUs can concurrently - * modify other bits in the word. - */ -static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void +arch___clear_bit_unlock(long nr, volatile unsigned long *addr) { - __clear_bit(nr, addr); + arch___clear_bit(nr, addr); } -/** - * __change_bit - Toggle a bit in memory - * @nr: the bit to change - * @addr: the address to start counting from - * - * Unlike change_bit(), this function is non-atomic and may be reordered. - * If it's called on the same region of memory simultaneously, the effect - * may be that only one operation succeeds. - */ -static __always_inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch___change_bit(long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -/** - * change_bit - Toggle a bit in memory - * @nr: Bit to change - * @addr: Address to start counting from - * - * change_bit() is atomic and may not be reordered. - * Note that @nr may be almost arbitrarily large; this function is not - * restricted to acting on a single-word quantity. - */ -static __always_inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void +arch_change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -196,42 +133,20 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) } } -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. - */ -static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_test_and_set_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } -/** - * test_and_set_bit_lock - Set a bit and return its old value for lock - * @nr: Bit to set - * @addr: Address to count from - * - * This is the same as test_and_set_bit on x86. 
- */ static __always_inline bool -test_and_set_bit_lock(long nr, volatile unsigned long *addr) +arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) { - return test_and_set_bit(nr, addr); + return arch_test_and_set_bit(nr, addr); } -/** - * __test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation is non-atomic and can be reordered. - * If two examples of this operation race, one can appear to succeed - * but actually fail. You must protect multiple accesses with a lock. - */ -static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_set_bit(long nr, volatile unsigned long *addr) { bool oldbit; @@ -242,28 +157,13 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * return oldbit; } -/** - * test_and_clear_bit - Clear a bit and return its old value - * @nr: Bit to clear - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. - */ -static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_test_and_clear_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } -/** - * __test_and_clear_bit - Clear a bit and return its old value - * @nr: Bit to clear - * @addr: Address to count from - * - * This operation is non-atomic and can be reordered. - * If two examples of this operation race, one can appear to succeed - * but actually fail. You must protect multiple accesses with a lock. - * +/* * Note: the operation is performed atomically with respect to * the local CPU, but not other CPUs. Portable code should not * rely on this behaviour. @@ -271,7 +171,8 @@ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long * * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_clear_bit(long nr, volatile unsigned long *addr) { bool oldbit; @@ -282,8 +183,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long return oldbit; } -/* WARNING: non atomic and it can be reordered! */ -static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_change_bit(long nr, volatile unsigned long *addr) { bool oldbit; @@ -295,15 +196,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon return oldbit; } -/** - * test_and_change_bit - Change a bit and return its old value - * @nr: Bit to change - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. 
- */ -static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +arch_test_and_change_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } @@ -326,16 +220,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l return oldbit; } -#if 0 /* Fool kernel-doc since it doesn't do macros yet */ -/** - * test_bit - Determine whether a bit is set - * @nr: bit number to test - * @addr: Address to start counting from - */ -static bool test_bit(int nr, const volatile unsigned long *addr); -#endif - -#define test_bit(nr, addr) \ +#define arch_test_bit(nr, addr) \ (__builtin_constant_p((nr)) \ ? constant_test_bit((nr), (addr)) \ : variable_test_bit((nr), (addr))) @@ -504,6 +389,8 @@ static __always_inline int fls64(__u64 x) #include <asm-generic/bitops/const_hweight.h> +#include <asm-generic/bitops-instrumented.h> + #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index f6f6ef436599..101eb944f13c 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -24,7 +24,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear * this field. The purpose of this field is to guarantee * compliance with the x86 boot spec located in - * Documentation/x86/boot.txt . That spec says that the + * Documentation/x86/boot.rst . That spec says that the * *whole* structure should be cleared, after which only the * portion defined by struct setup_header (boot_params->hdr) * should be copied in. 
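/*
 * Editor's note: stripping the kernel-doc and renaming everything to
 * arch_*() above is what lets x86 adopt the generic instrumented wrappers
 * (the new <asm-generic/bitops-instrumented.h> include). Those wrappers
 * re-establish the unprefixed names and add a KASAN check ahead of each
 * arch op, roughly like this (a sketch of the generic header, not part of
 * this diff):
 */
static inline void set_bit(long nr, volatile unsigned long *addr)
{
	kasan_check_write(addr + BIT_WORD(nr), sizeof(long));
	arch_set_bit(nr, addr);
}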
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d337c51f7e6..58acda503817 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,8 +22,8 @@ enum cpuid_leafs CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, - CPUID_F_0_EDX, - CPUID_F_1_EDX, + CPUID_LNX_4, + CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 75f27ee2c263..998c2cc08363 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -239,12 +239,14 @@ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ @@ -269,13 +271,19 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0xf, etc. + * + * Reuse free bits when adding new feature flags! 
+ */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -322,6 +330,7 @@ #define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7e42b285c856..c6136d79f8c0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -47,7 +47,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 5cbce6fbb534..296b346184b2 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -22,6 +22,35 @@ pop %_ASM_BP .endm +#ifdef CONFIG_X86_64 +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts + * the original rbp. + */ +.macro ENCODE_FRAME_POINTER ptregs_offset=0 + leaq 1+\ptregs_offset(%rsp), %rbp +.endm +#else /* !CONFIG_X86_64 */ +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just clearing the MSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the + * original ebp. 
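+ *
+ * For illustration (not part of this patch): with %esp == 0xc1234568
+ * this yields %ebp == 0x41234568.  Kernel stack addresses all have
+ * the top bit set, so the unwinder can detect the encoding and undo
+ * it with:
+ *
+ *	regs = (struct pt_regs *)(ebp | 0x80000000);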
+ */ +.macro ENCODE_FRAME_POINTER + mov %esp, %ebp + andl $0x7fffffff, %ebp +.endm +#endif /* CONFIG_X86_64 */ + #else /* !__ASSEMBLY__ */ #define FRAME_BEGIN \ @@ -30,12 +59,32 @@ #define FRAME_END "pop %" _ASM_BP "\n" +#ifdef CONFIG_X86_64 +#define ENCODE_FRAME_POINTER \ + "lea 1(%rsp), %rbp\n\t" +#else /* !CONFIG_X86_64 */ +#define ENCODE_FRAME_POINTER \ + "movl %esp, %ebp\n\t" \ + "andl $0x7fffffff, %ebp\n\t" +#endif /* CONFIG_X86_64 */ + #endif /* __ASSEMBLY__ */ #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ +#ifdef __ASSEMBLY__ + +.macro ENCODE_FRAME_POINTER ptregs_offset=0 +.endm + +#else /* !__ASSEMBLY */ + +#define ENCODE_FRAME_POINTER + +#endif + #define FRAME_BEGIN #define FRAME_END #define FRAME_OFFSET 0 diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index d9069bb26c7f..07533795b8d2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -37,7 +37,7 @@ typedef struct { #ifdef CONFIG_X86_MCE_AMD unsigned int irq_deferred_error_count; #endif -#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR unsigned int irq_hv_callback_count; #endif #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 67385d56d4f4..6352dee37cda 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -75,16 +75,15 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; -struct hpet_dev; +struct hpet_channel; struct irq_domain; extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); -extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); +extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); extern struct irq_domain *hpet_create_irq_domain(int hpet_id); extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_dev *dev, int dev_num); + struct hpet_channel *hc, int dev_num); #ifdef CONFIG_HPET_EMULATE_RTC diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 32e666e1231e..cbd97e22d2f3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -150,8 +150,11 @@ extern char irq_entries_start[]; #define trace_irq_entries_start irq_entries_start #endif +extern char spurious_entries_start[]; + #define VECTOR_UNUSED NULL -#define VECTOR_RETRIGGERED ((void *)~0UL) +#define VECTOR_SHUTDOWN ((void *)~0UL) +#define VECTOR_RETRIGGERED ((void *)~1UL) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index cdf44aa9a501..af78cd72b8f3 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -401,6 +401,12 @@ enum HV_GENERIC_SET_FORMAT { #define HV_STATUS_INVALID_CONNECTION_ID 18 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 +/* + * The Hyper-V TimeRefCount register and the TSC + * page provide a guest VM clock with 100ns tick rate + */ +#define HV_CLOCK_HZ (NSEC_PER_SEC/100) + typedef struct _HV_REFERENCE_TSC_PAGE { __u32 tsc_sequence; __u32 res1; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 8c5aaba6633f..50a30f6c668b 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -29,6 +29,7 @@ enum x86_hypervisor_type { X86_HYPER_XEN_HVM, 
X86_HYPER_KVM, X86_HYPER_JAILHOUSE, + X86_HYPER_ACRN, }; #ifdef CONFIG_HYPERVISOR_GUEST diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 310118805f57..0278aa66ef62 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -56,6 +56,7 @@ #define INTEL_FAM6_ICELAKE_XEON_D 0x6C #define INTEL_FAM6_ICELAKE_DESKTOP 0x7D #define INTEL_FAM6_ICELAKE_MOBILE 0x7E +#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* "Small Core" Processors (Atom) */ @@ -76,6 +77,7 @@ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ #define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ + #define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h index 8f3bee821e6c..187ce59aea28 100644 --- a/arch/x86/include/asm/irq_regs.h +++ b/arch/x86/include/asm/irq_regs.h @@ -16,7 +16,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return this_cpu_read(irq_regs); + return __this_cpu_read(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) @@ -24,7 +24,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) struct pt_regs *old_regs; old_regs = get_irq_regs(); - this_cpu_write(irq_regs, new_regs); + __this_cpu_write(irq_regs, new_regs); return old_regs; } diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 65191ce8e1cf..06c3cc22a058 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 003f2daa3b0f..5e7d6b46de97 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -71,22 +71,6 @@ struct kimage; #define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* - * CPU does not save ss and sp on stack if execution is already - * running in kernel mode at the time of NMI occurrence. This code - * fixes it. - */ -static inline void crash_fixup_ss_esp(struct pt_regs *newregs, - struct pt_regs *oldregs) -{ -#ifdef CONFIG_X86_32 - newregs->sp = (unsigned long)&(oldregs->sp); - asm volatile("xorl %%eax, %%eax\n\t" - "movw %%ss, %%ax\n\t" - :"=a"(newregs->ss)); -#endif -} - -/* * This function is responsible for capturing register states if coming * via panic otherwise just fix up the ss and sp if coming via kernel * mode exception. 
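The crash_fixup_ss_esp() deletion above is possible because the reworked 32-bit entry code now always saves a complete pt_regs, including sp and ss, even for kernel-mode exceptions. For illustration (reconstructed from the deleted helper, not part of this patch), the old workaround recovered the pre-exception stack pointer as the address where sp *would* have been pushed:

	/* old 32-bit fixup, now unnecessary */
	newregs->sp = (unsigned long)&(oldregs->sp);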
@@ -96,7 +80,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs, { if (oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); - crash_fixup_ss_esp(newregs, oldregs); } else { #ifdef CONFIG_X86_32 asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 26d1eb83f72a..0cc5b611a113 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -686,6 +686,7 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 msr_ia32_power_ctl; u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ @@ -752,6 +753,8 @@ struct kvm_vcpu_arch { struct gfn_to_hva_cache data; } pv_eoi; + u64 msr_kvm_poll_control; + /* * Indicate whether the access faults on its page table in guest * which is set when fix page fault and used to detect unhandeable @@ -879,6 +882,7 @@ struct kvm_arch { bool mwait_in_guest; bool hlt_in_guest; bool pause_in_guest; + bool cstate_in_guest; unsigned long irq_sources_bitmap; s64 kvmclock_offset; @@ -926,6 +930,8 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + + struct kvm_pmu_event_filter *pmu_event_filter; }; struct kvm_vm_stat { @@ -996,7 +1002,7 @@ struct kvm_x86_ops { int (*disabled_by_bios)(void); /* __init */ int (*hardware_enable)(void); void (*hardware_disable)(void); - void (*check_processor_compatibility)(void *rtn); + int (*check_processor_compatibility)(void);/* __init */ int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); @@ -1110,7 +1116,7 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - void (*handle_external_intr)(struct kvm_vcpu *vcpu); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); @@ -1529,7 +1535,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5ff3e8af2c20..e78c7db87801 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -59,6 +59,7 @@ typedef struct { #define INIT_MM_CONTEXT(mm) \ .context = { \ .ctx_id = 1, \ + .lock = __MUTEX_INITIALIZER(mm.context.lock), \ } void leave_mm(int cpu); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index cc60e617931c..2ef31cc8c529 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,84 +3,15 @@ #define _ASM_X86_MSHYPER_H #include <linux/types.h> -#include <linux/atomic.h> #include <linux/nmi.h> #include <asm/io.h> #include <asm/hyperv-tlfs.h> #include <asm/nospec-branch.h> -#define VP_INVAL U32_MAX - -struct ms_hyperv_info { - u32 features; - u32 misc_features; - u32 hints; - u32 nested_features; - u32 max_vp_index; - u32 max_lp_index; -}; - -extern struct ms_hyperv_info ms_hyperv; - - typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, void *data); -/* - * Generate the guest ID. 
- */ - -static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, - __u64 d_info2) -{ - __u64 guest_id = 0; - - guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 48); - guest_id |= (d_info1 << 48); - guest_id |= (kernel_version << 16); - guest_id |= d_info2; - - return guest_id; -} - - -/* Free the message slot and signal end-of-message if required */ -static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) -{ - /* - * On crash we're reading some other CPU's message page and we need - * to be careful: this other CPU may already had cleared the header - * and the host may already had delivered some other message there. - * In case we blindly write msg->header.message_type we're going - * to lose it. We can still lose a message of the same type but - * we count on the fact that there can only be one - * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages - * on crash. - */ - if (cmpxchg(&msg->header.message_type, old_msg_type, - HVMSG_NONE) != old_msg_type) - return; - - /* - * Make sure the write to MessageType (ie set to - * HVMSG_NONE) happens before we read the - * MessagePending and EOMing. Otherwise, the EOMing - * will not deliver any more messages since there is - * no empty slot - */ - mb(); - - if (msg->header.message_flags.msg_pending) { - /* - * This will cause message queue rescan to - * possibly deliver another msg from the - * hypervisor - */ - wrmsrl(HV_X64_MSR_EOM, 0); - } -} - #define hv_init_timer(timer, tick) \ wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick) #define hv_init_timer_config(timer, val) \ @@ -97,6 +28,8 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) +#define hv_signal_eom() wrmsrl(HV_X64_MSR_EOM, 0) + #define hv_get_synint_state(int_num, val) \ rdmsrl(HV_X64_MSR_SINT0 + int_num, val) #define hv_set_synint_state(int_num, val) \ @@ -105,19 +38,23 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_crash_ctl(val) \ rdmsrl(HV_X64_MSR_CRASH_CTL, val) +#define hv_get_time_ref_count(val) \ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val) + +#define hv_get_reference_tsc(val) \ + rdmsrl(HV_X64_MSR_REFERENCE_TSC, val) +#define hv_set_reference_tsc(val) \ + wrmsrl(HV_X64_MSR_REFERENCE_TSC, val) +#define hv_set_clocksource_vdso(val) \ + ((val).archdata.vclock_mode = VCLOCK_HVCLOCK) +#define hv_get_raw_timer() rdtsc_ordered() + void hyperv_callback_vector(void); void hyperv_reenlightenment_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector #endif void hyperv_vector_handler(struct pt_regs *regs); -void hv_setup_vmbus_irq(void (*handler)(void)); -void hv_remove_vmbus_irq(void); - -void hv_setup_kexec_handler(void (*handler)(void)); -void hv_remove_kexec_handler(void); -void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); -void hv_remove_crash_handler(void); /* * Routines for stimer0 Direct Mode handling. 
@@ -125,15 +62,12 @@ void hv_remove_crash_handler(void); */ void hv_stimer0_vector_handler(struct pt_regs *regs); void hv_stimer0_callback_vector(void); -int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)); -void hv_remove_stimer0_irq(int irq); static inline void hv_enable_stimer0_percpu_irq(int irq) {} static inline void hv_disable_stimer0_percpu_irq(int irq) {} #if IS_ENABLED(CONFIG_HYPERV) -extern struct clocksource *hyperv_cs; extern void *hv_hypercall_pg; extern void __percpu **hyperv_pcpu_input_arg; @@ -272,14 +206,6 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, return status; } -/* - * Hypervisor's notion of virtual processor ID is different from - * Linux' notion of CPU ID. This information can only be retrieved - * in the context of the calling CPU. Setup a map for easy access - * to this information. - */ -extern u32 *hv_vp_index; -extern u32 hv_max_vp_index; extern struct hv_vp_assist_page **hv_vp_assist_page; static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) @@ -290,63 +216,8 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) return hv_vp_assist_page[cpu]; } -/** - * hv_cpu_number_to_vp_number() - Map CPU to VP. - * @cpu_number: CPU number in Linux terms - * - * This function returns the mapping between the Linux processor - * number and the hypervisor's virtual processor number, useful - * in making hypercalls and such that talk about specific - * processors. - * - * Return: Virtual processor number in Hyper-V terms - */ -static inline int hv_cpu_number_to_vp_number(int cpu_number) -{ - return hv_vp_index[cpu_number]; -} - -static inline int cpumask_to_vpset(struct hv_vpset *vpset, - const struct cpumask *cpus) -{ - int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; - - /* valid_bank_mask can represent up to 64 banks */ - if (hv_max_vp_index / 64 >= 64) - return 0; - - /* - * Clear all banks up to the maximum possible bank as hv_tlb_flush_ex - * structs are not cleared between calls, we risk flushing unneeded - * vCPUs otherwise. - */ - for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) - vpset->bank_contents[vcpu_bank] = 0; - - /* - * Some banks may end up being empty but this is acceptable. 
- */ - for_each_cpu(cpu, cpus) { - vcpu = hv_cpu_number_to_vp_number(cpu); - if (vcpu == VP_INVAL) - return -1; - vcpu_bank = vcpu / 64; - vcpu_offset = vcpu % 64; - __set_bit(vcpu_offset, (unsigned long *) - &vpset->bank_contents[vcpu_bank]); - if (vcpu_bank >= nr_bank) - nr_bank = vcpu_bank + 1; - } - vpset->valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0); - return nr_bank; -} - void __init hyperv_init(void); void hyperv_setup_mmu_ops(void); -void hyperv_report_panic(struct pt_regs *regs, long err); -void hyperv_report_panic_msg(phys_addr_t pa, size_t size); -bool hv_is_hyperv_initialized(void); -void hyperv_cleanup(void); void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); @@ -369,8 +240,6 @@ static inline void hv_apic_init(void) {} #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} -static inline bool hv_is_hyperv_initialized(void) { return false; } -static inline void hyperv_cleanup(void) {} static inline void hyperv_setup_mmu_ops(void) {} static inline void set_hv_tscchange_cb(void (*cb)(void)) {} static inline void clear_hv_tscchange_cb(void) {} @@ -387,73 +256,7 @@ static inline int hyperv_flush_guest_mapping_range(u64 as, } #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_HYPERV_TSCPAGE -struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, - u64 *cur_tsc) -{ - u64 scale, offset; - u32 sequence; - - /* - * The protocol for reading Hyper-V TSC page is specified in Hypervisor - * Top-Level Functional Specification ver. 3.0 and above. To get the - * reference time we must do the following: - * - READ ReferenceTscSequence - * A special '0' value indicates the time source is unreliable and we - * need to use something else. The currently published specification - * versions (up to 4.0b) contain a mistake and wrongly claim '-1' - * instead of '0' as the special value, see commit c35b82ef0294. - * - ReferenceTime = - * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset - * - READ ReferenceTscSequence again. In case its value has changed - * since our first reading we need to discard ReferenceTime and repeat - * the whole sequence as the hypervisor was updating the page in - * between. - */ - do { - sequence = READ_ONCE(tsc_pg->tsc_sequence); - if (!sequence) - return U64_MAX; - /* - * Make sure we read sequence before we read other values from - * TSC page. - */ - smp_rmb(); - - scale = READ_ONCE(tsc_pg->tsc_scale); - offset = READ_ONCE(tsc_pg->tsc_offset); - *cur_tsc = rdtsc_ordered(); - - /* - * Make sure we read sequence after we read all other values - * from TSC page. 
- */ - smp_rmb(); - - } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); - - return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; -} - -static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) -{ - u64 cur_tsc; - return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); -} +#include <asm-generic/mshyperv.h> -#else -static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) -{ - return NULL; -} - -static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, - u64 *cur_tsc) -{ - BUG(); - return U64_MAX; -} -#endif #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 979ef971cc78..6b4fc2788078 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,6 +61,15 @@ #define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 #define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) +#define MSR_IA32_UMWAIT_CONTROL 0xe1 +#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) +#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) +/* + * The time field is bit[31:2], but representing a 32bit value with + * bit[1:0] zero. + */ +#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index eb0f80ce8524..e28f8b723b5c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -86,9 +86,9 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { - mds_idle_clear_cpu_buffers(); - trace_hardirqs_on(); + + mds_idle_clear_cpu_buffers(); /* "mwait %eax, %ecx;" */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 793c14c372cb..288b065955b7 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -48,7 +48,7 @@ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 52 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 2474e434a6f7..946f8f1f1efc 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -88,7 +88,7 @@ struct pv_init_ops { * the number of bytes of code generated, as we nop pad the * rest in generic code. */ - unsigned (*patch)(u8 type, void *insnbuf, + unsigned (*patch)(u8 type, void *insn_buff, unsigned long addr, unsigned len); } __no_randomize_layout; @@ -370,18 +370,11 @@ extern struct paravirt_patch_template pv_ops; /* Simple instruction patching code. 
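 *
 * (Illustration, not part of this patch: native_patch() writes the
 * replacement instructions into insn_buff and returns the number of
 * bytes used; apply_paravirt() NOP-pads the remainder of the patch
 * site before poking it in.)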
*/ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" -#define DEF_NATIVE(ops, name, code) \ - __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) +unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len); +unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len); +unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_default(u8 type, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len); +unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len); int paravirt_disable_iospace(void); @@ -679,8 +672,8 @@ u64 _paravirt_ident_64(u64); /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ + u8 *instr; /* original instructions */ + u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ }; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 1a19d11cfbbd..2278797c769d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -87,7 +87,7 @@ * don't give an lvalue though). */ extern void __bad_percpu_size(void); -#define percpu_to_op(op, var, val) \ +#define percpu_to_op(qual, op, var, val) \ do { \ typedef typeof(var) pto_T__; \ if (0) { \ @@ -97,22 +97,22 @@ do { \ } \ switch (sizeof(var)) { \ case 1: \ - asm(op "b %1,"__percpu_arg(0) \ + asm qual (op "b %1,"__percpu_arg(0) \ : "+m" (var) \ : "qi" ((pto_T__)(val))); \ break; \ case 2: \ - asm(op "w %1,"__percpu_arg(0) \ + asm qual (op "w %1,"__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pto_T__)(val))); \ break; \ case 4: \ - asm(op "l %1,"__percpu_arg(0) \ + asm qual (op "l %1,"__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pto_T__)(val))); \ break; \ case 8: \ - asm(op "q %1,"__percpu_arg(0) \ + asm qual (op "q %1,"__percpu_arg(0) \ : "+m" (var) \ : "re" ((pto_T__)(val))); \ break; \ @@ -124,7 +124,7 @@ do { \ * Generate a percpu add to memory instruction and optimize code * if one is added or subtracted. 
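 *
 * For illustration (hypothetical expansions, not part of this patch):
 *
 *	this_cpu_add(var, 1)	->	incq %gs:var(%rip)
 *	this_cpu_add(var, -1)	->	decq %gs:var(%rip)
 *	this_cpu_add(var, 16)	->	addq $16, %gs:var(%rip)
 *
 * pao_ID__ constant-folds to 1, -1 or 0, so only one alternative
 * survives in the generated code.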
*/ -#define percpu_add_op(var, val) \ +#define percpu_add_op(qual, var, val) \ do { \ typedef typeof(var) pao_T__; \ const int pao_ID__ = (__builtin_constant_p(val) && \ @@ -138,41 +138,41 @@ do { \ switch (sizeof(var)) { \ case 1: \ if (pao_ID__ == 1) \ - asm("incb "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incb "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decb "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decb "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addb %1, "__percpu_arg(0) \ + asm qual ("addb %1, "__percpu_arg(0) \ : "+m" (var) \ : "qi" ((pao_T__)(val))); \ break; \ case 2: \ if (pao_ID__ == 1) \ - asm("incw "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incw "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decw "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decw "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addw %1, "__percpu_arg(0) \ + asm qual ("addw %1, "__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pao_T__)(val))); \ break; \ case 4: \ if (pao_ID__ == 1) \ - asm("incl "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incl "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decl "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decl "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addl %1, "__percpu_arg(0) \ + asm qual ("addl %1, "__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pao_T__)(val))); \ break; \ case 8: \ if (pao_ID__ == 1) \ - asm("incq "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incq "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decq "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decq "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addq %1, "__percpu_arg(0) \ + asm qual ("addq %1, "__percpu_arg(0) \ : "+m" (var) \ : "re" ((pao_T__)(val))); \ break; \ @@ -180,27 +180,27 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(qual, op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm volatile(op "b "__percpu_arg(1)",%0"\ + asm qual (op "b "__percpu_arg(1)",%0" \ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm volatile(op "w "__percpu_arg(1)",%0"\ + asm qual (op "w "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm volatile(op "l "__percpu_arg(1)",%0"\ + asm qual (op "l "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm volatile(op "q "__percpu_arg(1)",%0"\ + asm qual (op "q "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ @@ -238,23 +238,23 @@ do { \ pfo_ret__; \ }) -#define percpu_unary_op(op, var) \ +#define percpu_unary_op(qual, op, var) \ ({ \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(0) \ + asm qual (op "b "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(0) \ + asm qual (op "w "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(0) \ + asm qual (op "l "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(0) \ + asm qual (op "q "__percpu_arg(0) \ : "+m" (var)); \ break; \ default: __bad_percpu_size(); \ @@ -264,27 +264,27 @@ do { \ /* * Add return operation */ -#define percpu_add_return_op(var, val) \ +#define percpu_add_return_op(qual, var, val) \ ({ \ typeof(var) paro_ret__ = val; \ switch (sizeof(var)) { \ case 1: \ - asm("xaddb %0, "__percpu_arg(1) \ + asm qual ("xaddb %0, "__percpu_arg(1) \ : "+q" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 2: \ - asm("xaddw %0, "__percpu_arg(1) \ + asm qual ("xaddw %0, 
"__percpu_arg(1) \ : "+r" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 4: \ - asm("xaddl %0, "__percpu_arg(1) \ + asm qual ("xaddl %0, "__percpu_arg(1) \ : "+r" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 8: \ - asm("xaddq %0, "__percpu_arg(1) \ + asm qual ("xaddq %0, "__percpu_arg(1) \ : "+re" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ @@ -299,13 +299,13 @@ do { \ * expensive due to the implied lock prefix. The processor cannot prefetch * cachelines if xchg is used. */ -#define percpu_xchg_op(var, nval) \ +#define percpu_xchg_op(qual, var, nval) \ ({ \ typeof(var) pxo_ret__; \ typeof(var) pxo_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("\n\tmov "__percpu_arg(1)",%%al" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%al" \ "\n1:\tcmpxchgb %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -313,7 +313,7 @@ do { \ : "memory"); \ break; \ case 2: \ - asm("\n\tmov "__percpu_arg(1)",%%ax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%ax" \ "\n1:\tcmpxchgw %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -321,7 +321,7 @@ do { \ : "memory"); \ break; \ case 4: \ - asm("\n\tmov "__percpu_arg(1)",%%eax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%eax" \ "\n1:\tcmpxchgl %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -329,7 +329,7 @@ do { \ : "memory"); \ break; \ case 8: \ - asm("\n\tmov "__percpu_arg(1)",%%rax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%rax" \ "\n1:\tcmpxchgq %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -345,32 +345,32 @@ do { \ * cmpxchg has no such implied lock semantics as a result it is much * more efficient for cpu local operations. */ -#define percpu_cmpxchg_op(var, oval, nval) \ +#define percpu_cmpxchg_op(qual, var, oval, nval) \ ({ \ typeof(var) pco_ret__; \ typeof(var) pco_old__ = (oval); \ typeof(var) pco_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("cmpxchgb %2, "__percpu_arg(1) \ + asm qual ("cmpxchgb %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "q" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 2: \ - asm("cmpxchgw %2, "__percpu_arg(1) \ + asm qual ("cmpxchgw %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 4: \ - asm("cmpxchgl %2, "__percpu_arg(1) \ + asm qual ("cmpxchgl %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 8: \ - asm("cmpxchgq %2, "__percpu_arg(1) \ + asm qual ("cmpxchgq %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ @@ -391,58 +391,70 @@ do { \ */ #define this_cpu_read_stable(var) percpu_stable_op("mov", var) -#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) - -#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_or_1(pcp, 
val) percpu_to_op("or", (pcp), val) -#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) - -#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) - -#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_read_1(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op(, "mov", pcp) + +#define raw_cpu_write_1(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op(, "or", (pcp), val) + +/* + * raw_cpu_xchg() can use a load-store since it is not required to be + * IRQ-safe. 
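+ *
+ * (Illustration, not part of this patch: raw_cpu_*() callers must
+ * already exclude reentrancy -- preemption and interrupts are the
+ * caller's problem -- so the load-store pair that follows need not
+ * be a single atomic instruction.)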
+ */ +#define raw_percpu_xchg_op(var, nval) \ +({ \ + typeof(var) pxo_ret__ = raw_cpu_read(var); \ + raw_cpu_write(var, (nval)); \ + pxo_ret__; \ +}) + +#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) + +#define this_cpu_read_1(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(volatile, pcp, nval) + +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) + +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) #ifdef CONFIG_X86_CMPXCHG64 #define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ @@ -466,23 +478,23 @@ do { \ * 32 bit must fall back to generic operations. 
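 *
 * (Illustration, not part of this patch: on 32-bit, an 8-byte
 * this_cpu_add() falls back to the generic implementation, which
 * wraps a plain read-modify-write in local_irq_save()/restore().)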
*/ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_read_8(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) /* * Pretty complex macro to generate cmpxchg16 instruction. The instruction diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index a281e61ec60c..29aa7859bdee 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -6,6 +6,9 @@ #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> +#define __HAVE_ARCH_PTE_ALLOC_ONE +#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ + static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } #ifdef CONFIG_PARAVIRT_XXL @@ -47,24 +50,8 @@ extern gfp_t __userpte_alloc_gfp; extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -extern pte_t *pte_alloc_one_kernel(struct mm_struct *); extern pgtable_t pte_alloc_one(struct mm_struct *); -/* Should really implement gc for free page table pages. This could be - done with a reference count in struct page. 
*/ - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, struct page *pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f8b1ad2c3828..e3633795fb22 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -285,53 +285,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) -#define gup_get_pte gup_get_pte -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a PTE will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' iff pte_high - * sees 'h'. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have gotten rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. Because get_user_pages_fast() only operates on present ptes - * we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} - #include <asm/pgtable-invert.h> #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 4fe9e7fc74d3..c78da8eda8f2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -106,6 +106,6 @@ do { \ * with only a host target support using a 32-bit type for internal * representation. 
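 *
 * (Illustration, not part of this patch: 2<<31 is evaluated in plain
 * int, where shifting into the sign bit is undefined; _ULL(2) forces
 * the arithmetic to unsigned long long, so _ULL(2)<<31 ==
 * 0x100000000, i.e. the intended 4GiB.)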
*/ -#define LOWMEM_PAGES ((((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)) +#define LOWMEM_PAGES ((((_ULL(2)<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)) #endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0bb566315621..4990d26dfc73 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -259,14 +259,8 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted -static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long)nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; if (end >> __VIRTUAL_MASK_SHIFT) return false; return true; diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 88bca456da99..52e5f5f2240d 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -103,7 +103,7 @@ extern unsigned int ptrs_per_p4d; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* - * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * See Documentation/x86/x86_64/mm.rst for a description of the memory map. * * Be very careful vs. KASLR when changing anything here. The KASLR address * range must not overlap with anything except the KASAN shadow area, which diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c34a35c78618..6e0a3b43d027 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -105,7 +105,7 @@ struct cpuinfo_x86 { int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ - u16 x86_max_cores; + u16 x86_max_cores; u16 apicid; u16 initial_apicid; u16 x86_clflush_size; @@ -117,6 +117,8 @@ struct cpuinfo_x86 { u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; + u16 cpu_die_id; + u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; @@ -144,7 +146,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 -#define X86_VENDOR_NUM 10 +#define X86_VENDOR_ZHAOXIN 10 +#define X86_VENDOR_NUM 11 #define X86_VENDOR_UNKNOWN 0xff @@ -738,6 +741,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cr4_init(void); static inline unsigned long get_debugctlmsr(void) { diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 8a7fc0cca2d1..78cf265c5b58 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -102,8 +102,7 @@ extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); -extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code); +extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); static inline unsigned long regs_return_value(struct pt_regs *regs) @@ -166,14 +165,10 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #define compat_user_stack_pointer() current_pt_regs()->sp #endif -#ifdef CONFIG_X86_32 -extern unsigned long kernel_stack_pointer(struct pt_regs *regs); -#else static inline unsigned long 
kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) @@ -201,14 +196,6 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (unlikely(offset > MAX_REG_OFFSET)) return 0; #ifdef CONFIG_X86_32 - /* - * Traps from the kernel do not save sp and ss. - * Use the helper function to retrieve sp. - */ - if (offset == offsetof(struct pt_regs, sp) && - regs->cs == __KERNEL_CS) - return kernel_stack_pointer(regs); - /* The selector fields are 16-bit. */ if (offset == offsetof(struct pt_regs, cs) || offset == offsetof(struct pt_regs, ss) || @@ -234,8 +221,7 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, static inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { - return ((addr & ~(THREAD_SIZE - 1)) == - (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); + return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1))); } /** @@ -249,7 +235,7 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs, */ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) { - unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + unsigned long *addr = (unsigned long *)regs->sp; addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index b6033680d458..19b695ff2c68 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PVCLOCK_H #define _ASM_X86_PVCLOCK_H -#include <linux/clocksource.h> +#include <asm/clocksource.h> #include <asm/pvclock-abi.h> /* some helper functions for xen and kvm pv clock sources */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 8ea1cfdbeabc..71b32f2570ab 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -13,4 +13,6 @@ extern char __end_rodata_aligned[]; extern char __end_rodata_hpage_align[]; #endif +extern char __end_of_kernel_reserve[]; + #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index da545df207b2..e1356a3b8223 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -23,6 +23,7 @@ extern unsigned int num_processors; DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); @@ -162,7 +163,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. 
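 *
 * (Illustration, not part of this patch: the new __smp_processor_id()
 * uses the non-volatile __this_cpu_read(), which the compiler may
 * hoist and deduplicate across a function; raw_smp_processor_id()
 * keeps the volatile form so the value is re-read where it could
 * legitimately change.)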
*/ -#define raw_smp_processor_id() (this_cpu_read(cpu_number)) +#define raw_smp_processor_id() this_cpu_read(cpu_number) +#define __smp_processor_id() __this_cpu_read(cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 0a3c4cab39db..219be88a59d2 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include <asm/nops.h> +#include <asm/processor-flags.h> +#include <linux/jump_label.h> /* * Volatile isn't enough to prevent the compiler from reordering the @@ -16,6 +18,8 @@ */ extern unsigned long __force_order; +void native_write_cr0(unsigned long val); + static inline unsigned long native_read_cr0(void) { unsigned long val; @@ -23,11 +27,6 @@ static inline unsigned long native_read_cr0(void) return val; } -static inline void native_write_cr0(unsigned long val) -{ - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); -} - static inline unsigned long native_read_cr2(void) { unsigned long val; @@ -72,10 +71,7 @@ static inline unsigned long native_read_cr4(void) return val; } -static inline void native_write_cr4(unsigned long val) -{ - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); -} +void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_64 static inline unsigned long native_read_cr8(void) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a8d0cdf48616..14db05086bbf 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,7 +78,7 @@ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) - return (unsigned long *)kernel_stack_pointer(regs); + return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 880b5515b1d6..70c09967a999 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,6 +18,20 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +/* + * Currently, the max observed size in the kernel code is + * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. + * Raise it if needed. 
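+ *
+ * (Illustration, not part of this patch: a jump label update is a
+ * 5-byte relative jmp -- the 0xe9 opcode plus a 32-bit displacement
+ * -- replacing a 5-byte NOP, which is exactly what this limit
+ * covers.)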
+ */ +#define POKE_MAX_OPCODE_SIZE 5 + +struct text_poke_loc { + void *detour; + void *addr; + size_t len; + const char opcode[POKE_MAX_OPCODE_SIZE]; +}; + extern void text_poke_early(void *addr, const void *opcode, size_t len); /* @@ -38,6 +52,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; @@ -51,7 +66,6 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) #define INT3_INSN_SIZE 1 #define CALL_INSN_SIZE 5 -#ifdef CONFIG_X86_64 static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* @@ -69,7 +83,6 @@ static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } -#endif /* CONFIG_X86_64 */ #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index cef818b16045..8ac563abb567 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -7,6 +7,7 @@ extern void hpet_time_init(void); extern void time_init(void); +extern bool pit_timer_init(void); extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 453cf38a1c33..4b14d2318251 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -106,15 +106,25 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) +#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) +#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #ifdef CONFIG_SMP +#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) +extern unsigned int __max_die_per_package; + +static inline int topology_max_die_per_package(void) +{ + return __max_die_per_package; +} + extern int __max_smt_threads; static inline int topology_max_smt_threads(void) @@ -123,14 +133,21 @@ static inline int topology_max_smt_threads(void) } int topology_update_package_map(unsigned int apicid, unsigned int cpu); +int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); +int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); bool topology_smt_supported(void); #else #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } +static inline int +topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } +static inline int 
topology_phys_to_logical_die(unsigned int die, + unsigned int cpu) { return 0; } +static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline bool topology_smt_supported(void) { return false; } diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 146859efd83c..097589753fec 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -54,5 +54,6 @@ # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE +# define __ARCH_WANT_SYS_CLONE3 #endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h new file mode 100644 index 000000000000..ae91429129a6 --- /dev/null +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * Copyright (C) 2019 ARM Limited. + * Copyright 2006 Andi Kleen, SUSE Labs. + * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + */ +#ifndef __ASM_VDSO_GETTIMEOFDAY_H +#define __ASM_VDSO_GETTIMEOFDAY_H + +#ifndef __ASSEMBLY__ + +#include <uapi/linux/time.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> +#include <asm/unistd.h> +#include <asm/msr.h> +#include <asm/pvclock.h> +#include <clocksource/hyperv_timer.h> + +#define __vdso_data (VVAR(_vdso_data)) + +#define VDSO_HAS_TIME 1 + +#define VDSO_HAS_CLOCK_GETRES 1 + +/* + * Declare the memory-mapped vclock data pages. These come from hypervisors. + * If we ever reintroduce something like direct access to an MMIO clock like + * the HPET again, it will go here as well. + * + * A load from any of these pages will segfault if the clock in question is + * disabled, so appropriate compiler barriers and checks need to be used + * to prevent stray loads. + * + * These declarations MUST NOT be const. The compiler will assume that + * an extern const variable has genuinely constant contents, and the + * resulting code won't work, since the whole point is that these pages + * change over time, possibly while we're accessing them. + */ + +#ifdef CONFIG_PARAVIRT_CLOCK +/* + * This is the vCPU 0 pvclock page. We only use pvclock from the vDSO + * if the hypervisor tells us that all vCPUs can get valid data from the + * vCPU 0 page. 
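The per-die topology accessors added above mirror the existing package/core ones, so enumeration code extends mechanically. A minimal in-kernel sketch that walks them; the pr_info() dump and the calling context are illustrative only:

    int cpu;

    for_each_online_cpu(cpu)
        pr_info("CPU%d: pkg %d die %d core %d (logical die %d)\n",
                cpu,
                topology_physical_package_id(cpu),
                topology_die_id(cpu),
                topology_core_id(cpu),
                topology_logical_die_id(cpu));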
+ */ +extern struct pvclock_vsyscall_time_info pvclock_page + __attribute__((visibility("hidden"))); +#endif + +#ifdef CONFIG_HYPERV_TSCPAGE +extern struct ms_hyperv_tsc_page hvclock_page + __attribute__((visibility("hidden"))); +#endif + +#ifndef BUILD_VDSO32 + +static __always_inline +long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ("syscall" : "=a" (ret), "=m" (*_ts) : + "0" (__NR_clock_gettime), "D" (_clkid), "S" (_ts) : + "rcx", "r11"); + + return ret; +} + +static __always_inline +long gettimeofday_fallback(struct __kernel_old_timeval *_tv, + struct timezone *_tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (_tv), "S" (_tz) : "memory"); + + return ret; +} + +static __always_inline +long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ("syscall" : "=a" (ret), "=m" (*_ts) : + "0" (__NR_clock_getres), "D" (_clkid), "S" (_ts) : + "rcx", "r11"); + + return ret; +} + +#else + +static __always_inline +long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*_ts) + : "0" (__NR_clock_gettime64), [clock] "g" (_clkid), "c" (_ts) + : "edx"); + + return ret; +} + +static __always_inline +long gettimeofday_fallback(struct __kernel_old_timeval *_tv, + struct timezone *_tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (_tv), "c" (_tz) + : "memory", "edx"); + + return ret; +} + +static __always_inline long +clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*_ts) + : "0" (__NR_clock_getres_time64), [clock] "g" (_clkid), "c" (_ts) + : "edx"); + + return ret; +} + +#endif + +#ifdef CONFIG_PARAVIRT_CLOCK +static u64 vread_pvclock(void) +{ + const struct pvclock_vcpu_time_info *pvti = &pvclock_page.pvti; + u32 version; + u64 ret; + + /* + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. + * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + * + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. 
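The protocol described above is a plain version-retry (seqcount) read: an odd version means an update is in flight, and a changed version across the reads means the data may be torn. The loop in the next hunk implements it with pvclock_read_begin()/pvclock_read_retry(); a distilled standalone form, with a hypothetical record standing in for the pvti page, looks like this:

    #include <stdatomic.h>
    #include <stdint.h>

    struct rec {
        _Atomic uint32_t seq;       /* odd while an update is in flight */
        _Atomic uint64_t a, b;      /* protected payload */
    };

    static void read_rec(struct rec *r, uint64_t *a, uint64_t *b)
    {
        uint32_t start;

        do {
            /* spin until the sequence count is even (stable) */
            do {
                start = atomic_load_explicit(&r->seq,
                                             memory_order_acquire);
            } while (start & 1);

            *a = atomic_load_explicit(&r->a, memory_order_relaxed);
            *b = atomic_load_explicit(&r->b, memory_order_relaxed);

            /* order the payload loads before the version re-check */
            atomic_thread_fence(memory_order_acquire);
        } while (atomic_load_explicit(&r->seq,
                                      memory_order_relaxed) != start);
    }

The extra wrinkle in the pvclock case is the PVCLOCK_TSC_STABLE_BIT check inside the loop, which bails out entirely when vCPU 0's data cannot stand in for all vCPUs.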
+ */ + + do { + version = pvclock_read_begin(pvti); + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) + return U64_MAX; + + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); + } while (pvclock_read_retry(pvti, version)); + + return ret; +} +#endif + +#ifdef CONFIG_HYPERV_TSCPAGE +static u64 vread_hvclock(void) +{ + return hv_read_tsc_page(&hvclock_page); +} +#endif + +static inline u64 __arch_get_hw_counter(s32 clock_mode) +{ + if (clock_mode == VCLOCK_TSC) + return (u64)rdtsc_ordered(); + /* + * For any memory-mapped vclock type, we need to make sure that gcc + * doesn't cleverly hoist a load before the mode check. Otherwise we + * might end up touching the memory-mapped page even if the vclock in + * question isn't enabled, which will segfault. Hence the barriers. + */ +#ifdef CONFIG_PARAVIRT_CLOCK + if (clock_mode == VCLOCK_PVCLOCK) { + barrier(); + return vread_pvclock(); + } +#endif +#ifdef CONFIG_HYPERV_TSCPAGE + if (clock_mode == VCLOCK_HVCLOCK) { + barrier(); + return vread_hvclock(); + } +#endif + return U64_MAX; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + return __vdso_data; +} + +/* + * x86 specific delta calculation. + * + * The regular implementation assumes that clocksource reads are globally + * monotonic. The TSC can be slightly off across sockets which can cause + * the regular delta calculation (@cycles - @last) to return a huge time + * jump. + * + * Therefore it needs to be verified that @cycles are greater than + * @last. If not then use @last, which is the base time of the current + * conversion period. + * + * This variant also removes the masking of the subtraction because the + * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX + * which would result in a pointless operation. The compiler cannot + * optimize it away as the mask comes from the vdso data and is not compile + * time constant. + */ +static __always_inline +u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +{ + if (cycles > last) + return (cycles - last) * mult; + return 0; +} +#define vdso_calc_delta vdso_calc_delta + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h new file mode 100644 index 000000000000..0026ab2123ce --- /dev/null +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> + +int vclocks_used __read_mostly; + +DEFINE_VVAR(struct vdso_data, _vdso_data); +/* + * Update the vDSO data page to keep in sync with kernel timekeeping. + */ +static __always_inline +struct vdso_data *__x86_get_k_vdso_data(void) +{ + return _vdso_data; +} +#define __arch_get_k_vdso_data __x86_get_k_vdso_data + +static __always_inline +int __x86_get_clock_mode(struct timekeeper *tk) +{ + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + + /* Mark the new vclock used. 
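The clamping rule documented above is easy to sanity-check in isolation. A standalone sketch with made-up cycle counts, showing that a TSC read landing slightly behind @last yields zero instead of a huge wrapped delta:

    #include <stdint.h>
    #include <stdio.h>

    /* mirrors the x86 vdso_calc_delta() above (mask is ignored on x86) */
    static uint64_t calc_delta(uint64_t cycles, uint64_t last, uint32_t mult)
    {
        if (cycles > last)
            return (cycles - last) * mult;
        return 0;   /* stay at the base time of the conversion period */
    }

    int main(void)
    {
        /* cross-socket skew: this CPU's TSC is 5 cycles behind @last */
        printf("%llu\n", (unsigned long long)calc_delta(995, 1000, 3));  /* 0 */
        printf("%llu\n", (unsigned long long)calc_delta(1010, 1000, 3)); /* 30 */
        return 0;
    }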
*/ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + + return vclock_mode; +} +#define __arch_get_clock_mode __x86_get_clock_mode + +/* The asm-generic header needs to be included after the definitions above */ +#include <asm-generic/vdso/vsyscall.h> + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 913a133f8e6f..a2638c6124ed 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -3,7 +3,9 @@ #define _ASM_X86_VGTOD_H #include <linux/compiler.h> -#include <linux/clocksource.h> +#include <asm/clocksource.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> #include <uapi/linux/time.h> @@ -13,81 +15,10 @@ typedef u64 gtod_long_t; typedef unsigned long gtod_long_t; #endif -/* - * There is one of these objects in the vvar page for each - * vDSO-accelerated clockid. For high-resolution clocks, this encodes - * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse - * clocks, this encodes the actual time. - * - * To confuse the reader, for high-resolution clocks, nsec is left-shifted - * by vsyscall_gtod_data.shift. - */ -struct vgtod_ts { - u64 sec; - u64 nsec; -}; - -#define VGTOD_BASES (CLOCK_TAI + 1) -#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI)) -#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE)) - -/* - * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time - * so be carefull by modifying this structure. - */ -struct vsyscall_gtod_data { - unsigned int seq; - - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - - struct vgtod_ts basetime[VGTOD_BASES]; - - int tz_minuteswest; - int tz_dsttime; -}; -extern struct vsyscall_gtod_data vsyscall_gtod_data; - extern int vclocks_used; static inline bool vclock_was_used(int vclock) { return READ_ONCE(vclocks_used) & (1 << vclock); } -static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s) -{ - unsigned int ret; - -repeat: - ret = READ_ONCE(s->seq); - if (unlikely(ret & 1)) { - cpu_relax(); - goto repeat; - } - smp_rmb(); - return ret; -} - -static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, - unsigned int start) -{ - smp_rmb(); - return unlikely(s->seq != start); -} - -static inline void gtod_write_begin(struct vsyscall_gtod_data *s) -{ - ++s->seq; - smp_wmb(); -} - -static inline void gtod_write_end(struct vsyscall_gtod_data *s) -{ - smp_wmb(); - ++s->seq; -} - #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index b986b2ca688a..ab60a71a8dcb 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root); * Called on instruction fetch fault in vsyscall page. * Returns true if handled. 
*/ -extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +extern bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address); #else static inline void map_vsyscall(void) {} -static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +static inline bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address) { return false; } diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index e474f5c6e387..32f5d9a0b90e 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -32,19 +32,20 @@ extern char __vvar_page; #define DECLARE_VVAR(offset, type, name) \ - extern type vvar_ ## name __attribute__((visibility("hidden"))); + extern type vvar_ ## name[CS_BASES] \ + __attribute__((visibility("hidden"))); #define VVAR(name) (vvar_ ## name) #define DEFINE_VVAR(type, name) \ - type name \ + type name[CS_BASES] \ __attribute__((section(".vvar_" #name), aligned(16))) __visible #endif /* DECLARE_VVAR(offset, type, name) */ -DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) +DECLARE_VVAR(128, struct vdso_data, _vdso_data) #undef DECLARE_VVAR diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 60733f137e9a..c895df5482c5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -29,6 +29,8 @@ #define XLF_EFI_HANDOVER_32 (1<<2) #define XLF_EFI_HANDOVER_64 (1<<3) #define XLF_EFI_KEXEC (1<<4) +#define XLF_5LEVEL (1<<5) +#define XLF_5LEVEL_ENABLED (1<<6) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d6ab5b4d15e5..e901b0ab116f 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,10 +378,11 @@ struct kvm_sync_regs { struct kvm_vcpu_events events; }; -#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) -#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) -#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 /* unused */ @@ -432,4 +433,14 @@ struct kvm_nested_state { } data; }; +/* for KVM_CAP_PMU_EVENT_FILTER */ +struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u64 events[0]; +}; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 19980ec1a316..2a8e0b6b9805 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -29,6 +29,8 @@ #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 #define KVM_FEATURE_PV_SEND_IPI 11 +#define KVM_FEATURE_POLL_CONTROL 12 +#define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_HINTS_REALTIME 0 @@ -47,6 +49,7 @@ #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 +#define MSR_KVM_POLL_CONTROL 0x4b564d05 struct kvm_steal_time { __u64 steal; diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d213ec5c3766..f0b0c90dd398 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ 
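Because events[] in kvm_pmu_event_filter is a flexible array, the object handed to KVM is variable-sized and must be allocated accordingly. A userspace sketch of building a deny filter; the two raw event codes are made up, and the structure is consumed by the filter ioctl advertised through KVM_CAP_PMU_EVENT_FILTER:

    #include <stdlib.h>
    #include <string.h>
    #include <linux/kvm.h>

    /* build a filter that denies two (hypothetical) raw PMU event codes */
    static struct kvm_pmu_event_filter *build_deny_filter(void)
    {
        const __u64 deny[] = { 0x003c, 0x00c0 };
        size_t sz = sizeof(struct kvm_pmu_event_filter) + sizeof(deny);
        struct kvm_pmu_event_filter *f = calloc(1, sz);

        if (!f)
            return NULL;
        f->action  = KVM_PMU_EVENT_DENY;
        f->nevents = sizeof(deny) / sizeof(deny[0]);
        memcpy(f->events, deny, sizeof(deny));
        return f;
    }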
-146,7 +146,6 @@ #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 -#define VMX_ABORT_VMCS_CORRUPTED 3 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ce1b5cc360a2..3578ad248bc9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -30,7 +30,7 @@ KASAN_SANITIZE_paravirt.o := n OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y ifdef CONFIG_FRAME_POINTER OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y @@ -112,7 +112,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index a5e5484988fd..caf2edccbad2 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -64,6 +64,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, c->x86_stepping >= 0x0e)) flags->bm_check = 1; } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All Zhaoxin CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + /* + * On all recent Zhaoxin platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state. 
+ */ + flags->bm_control = 0; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 390596b761e3..ccd32013c47a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -14,6 +14,7 @@ #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/mmu_context.h> +#include <linux/bsearch.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -277,7 +278,7 @@ static inline bool is_jmp(const u8 opcode) } static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) { u8 *next_rip, *tgt_rip; s32 n_dspl, o_dspl; @@ -286,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) if (a->replacementlen != 5) return; - o_dspl = *(s32 *)(insnbuf + 1); + o_dspl = *(s32 *)(insn_buff + 1); /* next_rip of the replacement JMP */ next_rip = repl_insn + a->replacementlen; @@ -312,9 +313,9 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) two_byte_jmp: n_dspl -= 2; - insnbuf[0] = 0xeb; - insnbuf[1] = (s8)n_dspl; - add_nops(insnbuf + 2, 3); + insn_buff[0] = 0xeb; + insn_buff[1] = (s8)n_dspl; + add_nops(insn_buff + 2, 3); repl_len = 2; goto done; @@ -322,8 +323,8 @@ two_byte_jmp: five_byte_jmp: n_dspl -= 5; - insnbuf[0] = 0xe9; - *(s32 *)&insnbuf[1] = n_dspl; + insn_buff[0] = 0xe9; + *(s32 *)&insn_buff[1] = n_dspl; repl_len = 5; @@ -370,7 +371,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; + u8 insn_buff[MAX_PATCH_LEN]; DPRINTK("alt table %px, -> %px", start, end); /* @@ -383,11 +384,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - int insnbuf_sz = 0; + int insn_buff_sz = 0; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); if (!boot_cpu_has(a->cpuid)) { if (a->padlen > 1) @@ -405,8 +406,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; + memcpy(insn_buff, replacement, a->replacementlen); + insn_buff_sz = a->replacementlen; /* * 0xe8 is a relative jump; fix the offset. @@ -414,24 +415,24 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * Instruction length is checked before the opcode to avoid * accessing uninitialized bytes for zero-length replacements. 
*/ - if (a->replacementlen == 5 && *insnbuf == 0xe8) { - *(s32 *)(insnbuf + 1) += replacement - instr; + if (a->replacementlen == 5 && *insn_buff == 0xe8) { + *(s32 *)(insn_buff + 1) += replacement - instr; DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insnbuf + 1), - (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + *(s32 *)(insn_buff + 1), + (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); } if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insnbuf); + recompute_jump(a, instr, replacement, insn_buff); if (a->instrlen > a->replacementlen) { - add_nops(insnbuf + a->replacementlen, + add_nops(insn_buff + a->replacementlen, a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; + insn_buff_sz += a->instrlen - a->replacementlen; } - DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insnbuf, insnbuf_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } } @@ -593,33 +594,119 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) { struct paravirt_patch_site *p; - char insnbuf[MAX_PATCH_LEN]; + char insn_buff[MAX_PATCH_LEN]; for (p = start; p < end; p++) { unsigned int used; BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ - memcpy(insnbuf, p->instr, p->len); - used = pv_ops.init.patch(p->instrtype, insnbuf, - (unsigned long)p->instr, p->len); + memcpy(insn_buff, p->instr, p->len); + used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); /* Pad the rest with nops */ - add_nops(insnbuf + used, p->len - used); - text_poke_early(p->instr, insnbuf, p->len); + add_nops(insn_buff + used, p->len - used); + text_poke_early(p->instr, insn_buff, p->len); } } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ +/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ + +/* + * We define the int3_magic() function in assembly to control the calling + * convention such that we can 'call' it from assembly. 
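The 0xe8 fixup above is plain rel32 arithmetic: a 5-byte CALL encodes its target minus the address of the next instruction, so copying the instruction from the replacement site to the original site shifts the displacement by the distance between the two. The same adjustment as a standalone sketch:

    #include <stdint.h>
    #include <string.h>

    /*
     * buf holds a 5-byte E8 CALL copied from address 'from'; patch its
     * displacement so the call still reaches the same target once the
     * bytes are installed at address 'to'.
     */
    static void retarget_call(uint8_t buf[5], const uint8_t *from,
                              const uint8_t *to)
    {
        int32_t rel;

        memcpy(&rel, buf + 1, sizeof(rel));
        rel += (int32_t)(from - to);    /* new rel = target - (to + 5) */
        memcpy(buf + 1, &rel, sizeof(rel));
    }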
+ */ + +extern void int3_magic(unsigned int *ptr); /* defined in asm */ + +asm ( +" .pushsection .init.text, \"ax\", @progbits\n" +" .type int3_magic, @function\n" +"int3_magic:\n" +" movl $1, (%" _ASM_ARG1 ")\n" +" ret\n" +" .size int3_magic, .-int3_magic\n" +" .popsection\n" +); + +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +static void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * Stick the address of the INT3 instruction into int3_selftest_ip, + * then trigger the INT3, padded with NOPs to match a CALL instruction + * length. + */ + asm volatile ("1: int3; nop; nop; nop; nop\n\t" + ".pushsection .init.data,\"aw\"\n\t" + ".align " __ASM_SEL(4, 8) "\n\t" + ".type int3_selftest_ip, @object\n\t" + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" + "int3_selftest_ip:\n\t" + __ASM_SEL(.long, .quad) " 1b\n\t" + ".popsection\n\t" + : ASM_CALL_CONSTRAINT + : __ASM_SEL_RAW(a, D) (&val) + : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + void __init alternative_instructions(void) { - /* The patching is not fully atomic, so try to avoid local interruptions - that might execute the to be patched code. - Other CPUs are not running. */ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ stop_nmi(); /* @@ -644,10 +731,11 @@ void __init alternative_instructions(void) _text, _etext); } - if (!uniproc_patched || num_possible_cpus() == 1) + if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); + } #endif apply_paravirt(__parainstructions, __parainstructions_end); @@ -848,81 +936,133 @@ static void do_sync_core(void *info) sync_core(); } -static bool bp_patching_in_progress; -static void *bp_int3_handler, *bp_int3_addr; +static struct bp_patching_desc { + struct text_poke_loc *vec; + int nr_entries; +} bp_patching; + +static int patch_cmp(const void *key, const void *elt) +{ + struct text_poke_loc *tp = (struct text_poke_loc *) elt; + + if (key < tp->addr) + return -1; + if (key > tp->addr) + return 1; + return 0; +} +NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { + struct text_poke_loc *tp; + unsigned char int3 = 0xcc; + void *ip; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching_in_progress. + * bp_patching.nr_entries. * - * in_progress = TRUE INT3 + * nr_entries != 0 INT3 * WMB RMB - * write INT3 if (in_progress) + * write INT3 if (nr_entries) * - * Idem for bp_int3_handler. + * Idem for other elements in bp_patching. 
*/ smp_rmb(); - if (likely(!bp_patching_in_progress)) + if (likely(!bp_patching.nr_entries)) return 0; - if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) + if (user_mode(regs)) return 0; - /* set up the specified breakpoint handler */ - regs->ip = (unsigned long) bp_int3_handler; + /* + * Discount the sizeof(int3). See text_poke_bp_batch(). + */ + ip = (void *) regs->ip - sizeof(int3); + + /* + * Skip the binary search if there is a single member in the vector. + */ + if (unlikely(bp_patching.nr_entries > 1)) { + tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); + if (!tp) + return 0; + } else { + tp = bp_patching.vec; + if (tp->addr != ip) + return 0; + } + + /* set up the specified breakpoint detour */ + regs->ip = (unsigned long) tp->detour; return 1; } NOKPROBE_SYMBOL(poke_int3_handler); /** - * text_poke_bp() -- update instructions on live kernel on SMP - * @addr: address to patch - * @opcode: opcode of new instruction - * @len: length to copy - * @handler: address to jump to when the temporary breakpoint is hit + * text_poke_bp_batch() -- update instructions on live kernel on SMP + * @tp: vector of instructions to patch + * @nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the * synchronization using int3 breakpoint. * * The way it is done: - * - add a int3 trap to the address that will be patched + * - For each entry in the vector: + * - add a int3 trap to the address that will be patched * - sync cores - * - update all but the first byte of the patched range + * - For each entry in the vector: + * - update all but the first byte of the patched range * - sync cores - * - replace the first byte (int3) by the first byte of - * replacing opcode + * - For each entry in the vector: + * - replace the first byte (int3) by the first byte of + * replacing opcode * - sync cores */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { + int patched_all_but_first = 0; unsigned char int3 = 0xcc; - - bp_int3_handler = handler; - bp_int3_addr = (u8 *)addr + sizeof(int3); - bp_patching_in_progress = true; + unsigned int i; lockdep_assert_held(&text_mutex); + bp_patching.vec = tp; + bp_patching.nr_entries = nr_entries; + /* * Corresponding read barrier in int3 notifier for making sure the - * in_progress and handler are correctly ordered wrt. patching. + * nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); - text_poke(addr, &int3, sizeof(int3)); + /* + * First step: add a int3 trap to the address that will be patched. + */ + for (i = 0; i < nr_entries; i++) + text_poke(tp[i].addr, &int3, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - if (len - sizeof(int3) > 0) { - /* patch all but the first byte */ - text_poke((char *)addr + sizeof(int3), - (const char *) opcode + sizeof(int3), - len - sizeof(int3)); + /* + * Second step: update all but the first byte of the patched range. + */ + for (i = 0; i < nr_entries; i++) { + if (tp[i].len - sizeof(int3) > 0) { + text_poke((char *)tp[i].addr + sizeof(int3), + (const char *)tp[i].opcode + sizeof(int3), + tp[i].len - sizeof(int3)); + patched_all_but_first++; + } + } + + if (patched_all_but_first) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. 
But @@ -931,14 +1071,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) on_each_cpu(do_sync_core, NULL, 1); } - /* patch the first byte */ - text_poke(addr, opcode, sizeof(int3)); + /* + * Third step: replace the first byte (int3) by the first byte of + * replacing opcode. + */ + for (i = 0; i < nr_entries; i++) + text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. */ - bp_patching_in_progress = false; + bp_patching.vec = NULL; + bp_patching.nr_entries = 0; } +/** + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit + * + * Update a single instruction with the vector in the stack, avoiding + * dynamically allocated memory. This function should be used when it is + * not possible to allocate memory. + */ +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +{ + struct text_poke_loc tp = { + .detour = handler, + .addr = addr, + .len = len, + }; + + if (len > POKE_MAX_OPCODE_SIZE) { + WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); + return; + } + + memcpy((void *)tp.opcode, opcode, len); + + text_poke_bp_batch(&tp, 1); +} diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 002aedc69393..d63e63b7d1d9 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -72,7 +72,7 @@ static const struct pci_device_id hygon_root_ids[] = { {} }; -const struct pci_device_id hygon_nb_misc_ids[] = { +static const struct pci_device_id hygon_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 85be316665b4..1bd91cb7b320 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -195,7 +195,7 @@ static struct resource lapic_resource = { .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; -unsigned int lapic_timer_frequency = 0; +unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); @@ -501,7 +501,7 @@ lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) if (evt->features & CLOCK_EVT_FEAT_DUMMY) return 0; - __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1); + __setup_APIC_LVTT(lapic_timer_period, oneshot, 1); return 0; } @@ -805,11 +805,11 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) static int __init lapic_init_clockevent(void) { - if (!lapic_timer_frequency) + if (!lapic_timer_period) return -1; /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR, TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); @@ -821,6 +821,33 @@ static int __init lapic_init_clockevent(void) return 0; } +bool __init apic_needs_pit(void) +{ + /* + * If the frequencies are not known, PIT is required for both TSC + * and apic timer calibration. + */ + if (!tsc_khz || !cpu_khz) + return true; + + /* Is there an APIC at all? 
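Putting the batched interface above together: a caller with several validated patch sites fills a text_poke_loc vector and hands it to text_poke_bp_batch() under text_mutex. Since poke_int3_handler() looks sites up with bsearch(), the vector must be kept sorted by address. A hedged in-kernel sketch; the sites, detours and replacement opcodes are placeholders:

    extern void *site_lo, *site_hi;     /* patch addresses, site_lo < site_hi */
    extern void *stub_lo, *stub_hi;     /* detours taken while patching */
    extern const char new_lo[5], new_hi[5]; /* replacement opcodes */

    struct text_poke_loc vec[] = {
        { .addr = site_lo, .detour = stub_lo, .len = 5 },
        { .addr = site_hi, .detour = stub_hi, .len = 5 },
    };

    /* ->opcode is const in the struct; cast as text_poke_bp() itself does */
    memcpy((void *)vec[0].opcode, new_lo, 5);
    memcpy((void *)vec[1].opcode, new_hi, 5);

    mutex_lock(&text_mutex);
    text_poke_bp_batch(vec, ARRAY_SIZE(vec));
    mutex_unlock(&text_mutex);

The payoff is the synchronization cost: three core syncs for the whole vector instead of three per patched site.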
*/ + if (!boot_cpu_has(X86_FEATURE_APIC)) + return true; + + /* Deadline timer is based on TSC so no further PIT action required */ + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + + /* APIC timer disabled? */ + if (disable_apic_timer) + return true; + /* + * The APIC timer frequency is known already, no PIT calibration + * required. If unknown, let the PIT be initialized. + */ + return lapic_timer_period == 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); @@ -839,7 +866,7 @@ static int __init calibrate_APIC_clock(void) */ if (!lapic_init_clockevent()) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_frequency); + lapic_timer_period); /* * Direct calibration methods must have an always running * local APIC timer, no need for broadcast timer. @@ -884,13 +911,13 @@ static int __init calibrate_APIC_clock(void) pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, &delta, &deltatsc); - lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; lapic_init_clockevent(); apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - lapic_timer_frequency); + lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " @@ -901,13 +928,13 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... host bus clock speed is " "%u.%04u MHz.\n", - lapic_timer_frequency / (1000000 / HZ), - lapic_timer_frequency % (1000000 / HZ)); + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result */ - if (lapic_timer_frequency < (1000000 / HZ)) { + if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; @@ -1351,6 +1378,8 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } +static void __init apic_bsp_setup(bool upmode); + /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { @@ -2041,21 +2070,32 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) entering_irq(); trace_spurious_apic_entry(vector); + inc_irq_stat(irq_spurious_count); + + /* + * If this is a spurious interrupt then do not acknowledge + */ + if (vector == SPURIOUS_APIC_VECTOR) { + /* See SDM vol 3 */ + pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n", + smp_processor_id()); + goto out; + } + /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. + * If it is a vectored one, verify it's set in the ISR. If set, + * acknowledge it. */ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); - if (v & (1 << (vector & 0x1f))) + if (v & (1 << (vector & 0x1f))) { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", + vector, smp_processor_id()); ack_APIC_irq(); - - inc_irq_stat(irq_spurious_count); - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " - "should never happen.\n", vector, smp_processor_id()); - + } else { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. 
Not pending!\n", + vector, smp_processor_id()); + } +out: trace_spurious_apic_exit(vector); exiting_irq(); } @@ -2416,11 +2456,8 @@ static void __init apic_bsp_up_setup(void) /** * apic_bsp_setup - Setup function for local apic and io-apic * @upmode: Force UP mode (for APIC_init_uniprocessor) - * - * Returns: - * apic_id of BSP APIC */ -void __init apic_bsp_setup(bool upmode) +static void __init apic_bsp_setup(bool upmode) { connect_bsp_APIC(); if (upmode) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bf083c3f1d73..bbdca603f94a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -78,7 +78,7 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) int cpu = smp_processor_id(); if (cpu < BITS_PER_LONG) - clear_bit(cpu, &mask); + __clear_bit(cpu, &mask); _flat_send_IPI_mask(mask, vector); } @@ -92,7 +92,7 @@ static void flat_send_IPI_allbutself(int vector) unsigned long mask = cpumask_bits(cpu_online_mask)[0]; if (cpu < BITS_PER_LONG) - clear_bit(cpu, &mask); + __clear_bit(cpu, &mask); _flat_send_IPI_mask(mask, vector); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53aa234a6803..c7bb6c69f21c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -58,6 +58,7 @@ #include <asm/acpi.h> #include <asm/dma.h> #include <asm/timer.h> +#include <asm/time.h> #include <asm/i8259.h> #include <asm/setup.h> #include <asm/irq_remapping.h> @@ -1893,6 +1894,50 @@ static int ioapic_set_affinity(struct irq_data *irq_data, return ret; } +/* + * Interrupt shutdown masks the ioapic pin, but the interrupt might already + * be in flight, but not yet serviced by the target CPU. That means + * __synchronize_hardirq() would return and claim that everything is calmed + * down. So free_irq() would proceed and deactivate the interrupt and free + * resources. + * + * Once the target CPU comes around to service it, it will find a cleared + * vector and complain. While the spurious interrupt is harmless, the full + * release of resources might prevent the interrupt from being acknowledged + * which keeps the hardware in a weird state. + * + * Verify that the corresponding Remote-IRR bits are clear. + */ +static int ioapic_irq_get_chip_state(struct irq_data *irqd, + enum irqchip_irq_state which, + bool *state) +{ + struct mp_chip_data *mcd = irqd->chip_data; + struct IO_APIC_route_entry rentry; + struct irq_pin_list *p; + + if (which != IRQCHIP_STATE_ACTIVE) + return -EINVAL; + + *state = false; + raw_spin_lock(&ioapic_lock); + for_each_irq_pin(p, mcd->irq_2_pin) { + rentry = __ioapic_read_entry(p->apic, p->pin); + /* + * The remote IRR is only valid in level trigger mode. Its + * meaning is undefined for edge triggered interrupts and + * irrelevant because the IO-APIC treats them as fire and + * forget.
+ */ + if (rentry.irr && rentry.trigger) { + *state = true; + break; + } + } + raw_spin_unlock(&ioapic_lock); + return 0; +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, @@ -1902,6 +1947,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1914,6 +1960,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2083,6 +2130,9 @@ static inline void __init check_timer(void) unsigned long flags; int no_pin1 = 0; + if (!global_clock_event) + return; + local_irq_save(flags); /* diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index dad0dd759de2..7f7533462474 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -370,14 +370,14 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) return d; } -int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, +int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, int dev_num) { struct irq_alloc_info info; init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = dev; + info.hpet_data = hc; info.hpet_id = hpet_dev_id(domain); info.hpet_index = dev_num; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index e7cb78aed644..fdacb864c3dd 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -340,7 +340,7 @@ static void clear_irq_vector(struct irq_data *irqd) trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, apicd->prev_cpu); - per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); apicd->vector = 0; @@ -349,7 +349,7 @@ static void clear_irq_vector(struct irq_data *irqd) if (!vector) return; - per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 7685444a106b..609e499387a1 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -50,7 +50,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) cpumask_copy(tmpmsk, mask); /* If IPI should not be sent to self, clear current CPU */ if (apic_dest != APIC_DEST_ALLINC) - cpumask_clear_cpu(smp_processor_id(), tmpmsk); + __cpumask_clear_cpu(smp_processor_id(), tmpmsk); /* Collapse cpus in a cluster so a single IPI per cluster is sent */ for_each_cpu(cpu, tmpmsk) { diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 168543d077d7..da64452584b0 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -38,7 +38,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(TASK_TI_flags, task_struct, thread_info.flags); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile 
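With ioapic_irq_get_chip_state() wired into the irq chips above, generic code can ask whether a level-triggered interrupt is still in flight before tearing it down. A hedged sketch of that polling idiom via the generic irq_get_irqchip_state() accessor; the irq number and the busy-wait policy are illustrative, and bounded retries are omitted:

    bool inflight;

    do {
        inflight = false;
        /* a nonzero return means the chip cannot report this state */
        if (irq_get_irqchip_state(irq, IRQCHIP_STATE_ACTIVE, &inflight))
            break;
        if (inflight)
            cpu_relax();    /* Remote-IRR still set: IRQ in flight */
    } while (inflight);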
b/arch/x86/kernel/cpu/Makefile index 5102bf7c8192..d7a1e5a9331c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,6 +24,7 @@ obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o +obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o @@ -38,6 +39,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ @@ -47,6 +49,7 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_ACRN_GUEST) += acrn.o ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ @@ -54,8 +57,7 @@ quiet_cmd_mkcapflags = MKCAP $@ cpufeature = $(src)/../../include/asm/cpufeatures.h -targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) endif -clean-files += capflags.c +targets += capflags.c diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c new file mode 100644 index 000000000000..676022e71791 --- /dev/null +++ b/arch/x86/kernel/cpu/acrn.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN detection support + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + * + * Jason Chen CJ <jason.cj.chen@intel.com> + * Zhao Yakui <yakui.zhao@intel.com> + * + */ + +#include <linux/interrupt.h> +#include <asm/acrn.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/hypervisor.h> +#include <asm/irq_regs.h> + +static uint32_t __init acrn_detect(void) +{ + return hypervisor_cpuid_base("ACRNACRNACRN\0\0", 0); +} + +static void __init acrn_init_platform(void) +{ + /* Setup the IDT for ACRN hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); +} + +static bool acrn_x2apic_available(void) +{ + /* + * x2apic is not supported for now. Future enablement will have to check + * X86_FEATURE_X2APIC to determine whether x2apic is supported in the + * guest. + */ + return false; +} + +static void (*acrn_intr_handler)(void); + +__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * The hypervisor requires that the APIC EOI should be acked. + * If the APIC EOI is not acked, the APIC ISR bit for the + * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it + * will block the interrupt whose vector is lower than + * HYPERVISOR_CALLBACK_VECTOR. 
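acrn_detect() above uses the standard hypervisor CPUID convention: leaves from 0x40000000 upward carry a 12-byte vendor signature in EBX/ECX/EDX. A standalone userspace sketch of the same probe, using GCC's <cpuid.h>:

    #include <cpuid.h>
    #include <stdint.h>
    #include <string.h>

    /* return the CPUID base leaf whose hypervisor signature matches, or 0 */
    static uint32_t hv_cpuid_base(const char *sig)
    {
        uint32_t base, eax, signature[3];

        for (base = 0x40000000; base < 0x40010000; base += 0x100) {
            __cpuid(base, eax, signature[0], signature[1], signature[2]);
            (void)eax;  /* max leaf, not checked in this sketch */
            if (!memcmp(sig, signature, sizeof(signature)))
                return base;
        }
        return 0;
    }

    /* usage: hv_cpuid_base("ACRNACRNACRN") returns nonzero on an ACRN guest */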
+ */ + entering_ack_irq(); + inc_irq_stat(irq_hv_callback_count); + + if (acrn_intr_handler) + acrn_intr_handler(); + + exiting_irq(); + set_irq_regs(old_regs); +} + +const __initconst struct hypervisor_x86 x86_hyper_acrn = { + .name = "ACRN", + .detect = acrn_detect, + .type = X86_HYPER_ACRN, + .init.init_platform = acrn_init_platform, + .init.x2apic_available = acrn_x2apic_available, +}; diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e71a6ff8a67e..e2f319dc992d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/cpufreq.h> #include <linux/smp.h> +#include <linux/sched/isolation.h> #include "cpu.h" @@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -101,9 +105,12 @@ void arch_freq_prepare_all(void) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + continue; if (!aperfmperf_snapshot_cpu(cpu, now, false)) wait = true; + } if (wait) msleep(APERFMPERF_REFRESH_DELAY_MS); @@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) return per_cpu(samples.khz, cpu); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 395d46f78582..c7503be92f35 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) if (c->x86 < 0x17) { /* LLC is at the node level. */ per_cpu(cpu_llc_id, cpu) = node_id; - } else if (c->x86 == 0x17 && - c->x86_model >= 0 && c->x86_model <= 0x1F) { + } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2c57fffebf9b..11472178e17f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,6 +366,77 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +static unsigned long cr4_pinned_bits __ro_after_init; + +void native_write_cr0(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } +} +EXPORT_SYMBOL(native_write_cr0); + +void native_write_cr4(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { + bits_missing = ~val & cr4_pinned_bits; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. 
*/ + WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", + bits_missing); + } +} +EXPORT_SYMBOL(native_write_cr4); + +void cr4_init(void) +{ + unsigned long cr4 = __read_cr4(); + + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 |= cr4_pinned_bits; + + __write_cr4(cr4); + + /* Initialize cr4 shadow for this CPU. */ + this_cpu_write(cpu_tlbstate.cr4, cr4); +} + +/* + * Once CPU feature detection is finished (and boot params have been + * parsed), record any of the sensitive CR bits that are set, and + * enable CR pinning. + */ +static void __init setup_cr_pinning(void) +{ + unsigned long mask; + + mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + static_key_enable(&cr_pinning.key); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -801,6 +872,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } +static void init_cqm(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -823,6 +918,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; c->x86_capability[CPUID_7_EDX] = edx; + + /* Check valid sub-leaf index before accessing it */ + if (eax >= 1) { + cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_1_EAX] = eax; + } } /* Extended state features: level 0x0000000d */ @@ -832,33 +933,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* Additional Intel-defined flags: level 0x0000000F */ - if (c->cpuid_level >= 0x0000000F) { - - /* QoS sub-leaf, EAX=0Fh, ECX=0 */ - cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_0_EDX] = edx; - - if (cpu_has(c, X86_FEATURE_CQM_LLC)) { - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = ebx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_1_EDX] = edx; - - if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || - ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || - (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } - } else { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - } - } - /* AMD-defined flags: level 0x80000001 */ eax = cpuid_eax(0x80000000); c->extended_cpuid_level = eax; @@ -889,6 +963,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); + init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. 
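The CR write paths above follow a deliberate pin-and-repair pattern: perform the write, and if a pinned bit is observed missing afterwards, put it back before warning, so the window with a cleared security-sensitive bit stays as small as possible. A distilled standalone form, with an ordinary variable standing in for the control register:

    #include <stdio.h>

    static unsigned long pinned_bits;   /* fixed after early boot */
    static unsigned long fake_cr;       /* stands in for %cr0/%cr4 */

    static void pinned_write(unsigned long val)
    {
        unsigned long missing = 0;

    set_register:
        fake_cr = val;
        if ((val & pinned_bits) != pinned_bits) {
            missing = ~val & pinned_bits;
            val |= missing;
            goto set_register;      /* repair first, warn later */
        }
        if (missing)
            fprintf(stderr, "pinned bits went missing: %lx\n", missing);
    }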
@@ -1299,6 +1374,7 @@ static void validate_apic_and_package_id(struct cpuinfo_x86 *c) cpu, apicid, c->initial_apicid); } BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); + BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); #else c->logical_proc_id = 0; #endif @@ -1464,6 +1540,7 @@ void __init identify_boot_cpu(void) enable_sep_cpu(); #endif cpu_detect_tlb(&boot_cpu_data); + setup_cr_pinning(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1698,12 +1775,6 @@ void cpu_init(void) wait_for_master_cpu(cpu); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); - if (cpu) load_ucode_ap(); @@ -1798,12 +1869,6 @@ void cpu_init(void) wait_for_master_cpu(cpu); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. - */ - cr4_init_shadow(); - show_ucode_info_early(); pr_info("Initializing CPU#%d\n", cpu); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 2c0bd38a44ab..b5353244749b 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -20,6 +20,7 @@ struct cpuid_dep { * but it's difficult to tell that to the init reference checker. */ static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, @@ -27,7 +28,11 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, { X86_FEATURE_XMM, X86_FEATURE_FXSR }, { X86_FEATURE_XMM2, X86_FEATURE_XMM }, { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, @@ -59,6 +64,10 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 479ca4728de0..87e39ad8d873 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -32,6 +32,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; +extern const struct hypervisor_x86 x86_hyper_acrn; static const __initconst struct hypervisor_x86 * const hypervisors[] = { @@ -49,6 +50,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_JAILHOUSE_GUEST &x86_hyper_jailhouse, #endif +#ifdef CONFIG_ACRN_GUEST + &x86_hyper_acrn, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f17c1a714779..8d6d92ebeb54 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +/* + * Processors which have self-snooping capability can handle conflicting + * memory type 
across CPUs by snooping its own cache. However, there exists + * CPU models in which having conflicting memory types still leads to + * unpredictable behavior, machine check errors, or hangs. Clear this + * feature to prevent its use on machines with known erratas. + */ +static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) +{ + switch (c->x86_model) { + case INTEL_FAM6_CORE_YONAH: + case INTEL_FAM6_CORE2_MEROM: + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_SANDYBRIDGE: + setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); + } +} + static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) @@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + check_memory_type_self_snoop_errata(c); /* * Get the number of SMT siblings early from the extended topology diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 785050af85e5..6ea7fdc82f3c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -99,11 +99,6 @@ static struct smca_bank_name smca_names[] = { [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, }; -static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = -{ - [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } -}; - static const char *smca_get_name(enum smca_bank_types t) { if (t >= N_SMCA_BANK_TYPES) @@ -197,6 +192,9 @@ static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ +/* Map of banks that have more than MCA_MISC0 available. */ +static DEFINE_PER_CPU(u32, smca_misc_banks_map); + static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -206,6 +204,28 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; +static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) +{ + u32 low, high; + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return; + + if (!(low & MCI_CONFIG_MCAX)) + return; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) + return; + + if (low & MASK_BLKPTR_LO) + per_cpu(smca_misc_banks_map, cpu) |= BIT(bank); + +} + static void smca_configure(unsigned int bank, unsigned int cpu) { unsigned int i, hwid_mcatype; @@ -243,6 +263,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } + smca_set_misc_banks_map(bank, cpu); + /* Return early if this bank was already initialized. 
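The bookkeeping above lets the per-block MSR lookup in the next hunk collapse to a couple of checks: block 0 is always MCx_MISC0, and blocks 1-4 exist only on banks whose BLKPTR bit was recorded in smca_misc_banks_map. A hedged restatement of that lookup, where misc_map is a snapshot of the per-CPU bitmap:

    static u32 smca_block_msr(unsigned int bank, unsigned int block,
                              u32 misc_map)
    {
        if (!block)
            return MSR_AMD64_SMCA_MCx_MISC(bank);
        if (!(misc_map & BIT(bank)))
            return 0;   /* bank has no MISC1-4 registers */
        return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
    }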
*/ if (smca_banks[bank].hwid) return; @@ -453,50 +475,29 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block) +static u32 smca_get_block_address(unsigned int bank, unsigned int block, + unsigned int cpu) { - u32 low, high; - u32 addr = 0; - - if (smca_get_bank_type(bank) == SMCA_RESERVED) - return addr; - if (!block) return MSR_AMD64_SMCA_MCx_MISC(bank); - /* Check our cache first: */ - if (smca_bank_addrs[bank][block] != -1) - return smca_bank_addrs[bank][block]; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - goto out; - - if (!(low & MCI_CONFIG_MCAX)) - goto out; - - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && - (low & MASK_BLKPTR_LO)) - addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank))) + return 0; -out: - smca_bank_addrs[bank][block] = addr; - return addr; + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } static u32 get_block_address(u32 current_addr, u32 low, u32 high, - unsigned int bank, unsigned int block) + unsigned int bank, unsigned int block, + unsigned int cpu) { u32 addr = 0, offset = 0; - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; if (mce_flags.smca) - return smca_get_block_address(bank, block); + return smca_get_block_address(bank, block, cpu); /* Fall back to method we used for older processors: */ switch (block) { @@ -624,18 +625,19 @@ void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; int offset = -1; - for (bank = 0; bank < mca_cfg.banks; ++bank) { + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); disable_err_thresholding(c, bank); for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(address, low, high, bank, block, cpu); if (!address) break; @@ -973,7 +975,7 @@ static void amd_deferred_error_interrupt(void) { unsigned int bank; - for (bank = 0; bank < mca_cfg.banks; ++bank) + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) log_error_deferred(bank); } @@ -1014,7 +1016,7 @@ static void amd_threshold_interrupt(void) struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; unsigned int bank, cpu = smp_processor_id(); - for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; @@ -1201,7 +1203,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, u32 low, high; int err; - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return 0; if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) @@ -1252,7 +1254,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + 
address = get_block_address(address, low, high, bank, ++block, cpu); if (!address) return 0; @@ -1435,7 +1437,7 @@ int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; - for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); @@ -1456,14 +1458,14 @@ int mce_threshold_create_device(unsigned int cpu) if (bp) return 0; - bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *), + bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *), GFP_KERNEL); if (!bp) return -ENOMEM; per_cpu(threshold_banks, cpu) = bp; - for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 282916f3b8d8..743370ee4983 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -65,7 +65,23 @@ static DEFINE_MUTEX(mce_sysfs_mutex); DEFINE_PER_CPU(unsigned, mce_exception_count); -struct mce_bank *mce_banks __read_mostly; +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); + +struct mce_bank { + u64 ctl; /* subevents to enable */ + bool init; /* initialise bank? */ +}; +static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + +#define ATTR_LEN 16 +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank_dev { + struct device_attribute attr; /* device attribute */ + char attrname[ATTR_LEN]; /* attribute name */ + u8 bank; /* bank number */ +}; +static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS]; + struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { @@ -675,6 +691,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); */ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); bool error_seen = false; struct mce m; int i; @@ -686,7 +703,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (flags & MCP_TIMESTAMP) m.tsc = rdtsc(); - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -788,7 +805,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, char *tmp; int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { m->status = mce_rdmsrl(msr_ops.status(i)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1068,7 +1085,7 @@ static void mce_clear_state(unsigned long *toclear) { int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (test_bit(i, toclear)) mce_wrmsrl(msr_ops.status(i), 0); } @@ -1122,10 +1139,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, unsigned long *toclear, unsigned long *valid_banks, int no_way_out, int *worst) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i; - for (i = 0; i < cfg->banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) continue; @@ -1330,7 +1348,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) local_irq_enable(); if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS, current); + force_sig(SIGBUS); 
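/*
 * A minimal sketch (not code from the patch) of the per-CPU MCA bank
 * layout these hunks introduce: the global mca_cfg.banks count and the
 * shared mce_banks array become per-CPU, because the number of usable
 * banks can differ between CPUs. The definitions mirror the patch; the
 * walker helper is hypothetical and only illustrates the
 * this_cpu_read()/this_cpu_ptr() access idiom.
 */
#include <linux/percpu.h>
#include <linux/types.h>

#define MAX_NR_BANKS 32

struct mce_bank {
	u64 ctl;	/* subevents to enable */
	bool init;	/* initialise bank? */
};

static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

/* Walk only the banks this CPU implements; call with preemption disabled. */
static void for_each_bank_on_this_cpu(void (*fn)(int bank, struct mce_bank *b))
{
	struct mce_bank *banks = this_cpu_ptr(mce_banks_array);
	unsigned int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++)
		fn(i, &banks[i]);
}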
local_irq_disable(); ist_end_non_atomic(); } else { @@ -1463,27 +1481,29 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int __mcheck_cpu_mce_banks_init(void) +static void __mcheck_cpu_mce_banks_init(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u8 n_banks = this_cpu_read(mce_num_banks); int i; - mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); - if (!mce_banks) - return -ENOMEM; - - for (i = 0; i < MAX_NR_BANKS; i++) { + for (i = 0; i < n_banks; i++) { struct mce_bank *b = &mce_banks[i]; + /* + * Init them all, __mcheck_cpu_apply_quirks() is going to apply + * the required vendor quirks before + * __mcheck_cpu_init_clear_banks() does the final bank setup. + */ b->ctl = -1ULL; b->init = 1; } - return 0; } /* * Initialize Machine Checks for a CPU. */ -static int __mcheck_cpu_cap_init(void) +static void __mcheck_cpu_cap_init(void) { u64 cap; u8 b; @@ -1491,16 +1511,16 @@ static int __mcheck_cpu_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (WARN_ON_ONCE(b > MAX_NR_BANKS)) + + if (b > MAX_NR_BANKS) { + pr_warn("CPU%d: Using only %u machine check banks out of %u\n", + smp_processor_id(), MAX_NR_BANKS, b); b = MAX_NR_BANKS; + } - mca_cfg.banks = max(mca_cfg.banks, b); + this_cpu_write(mce_num_banks, b); - if (!mce_banks) { - int err = __mcheck_cpu_mce_banks_init(); - if (err) - return err; - } + __mcheck_cpu_mce_banks_init(); /* Use accurate RIP reporting if available. */ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) @@ -1508,8 +1528,6 @@ static int __mcheck_cpu_cap_init(void) if (cap & MCG_SER_P) mca_cfg.ser = 1; - - return 0; } static void __mcheck_cpu_init_generic(void) @@ -1536,9 +1554,10 @@ static void __mcheck_cpu_init_generic(void) static void __mcheck_cpu_init_clear_banks(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1549,6 +1568,33 @@ static void __mcheck_cpu_init_clear_banks(void) } /* + * Do a final check to see if there are any unused/RAZ banks. + * + * This must be done after the banks have been initialized and any quirks have + * been applied. + * + * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. + * Otherwise, a user who disables a bank will not be able to re-enable it + * without a system reboot. + */ +static void __mcheck_cpu_check_banks(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + + rdmsrl(msr_ops.ctl(i), msrval); + b->init = !!msrval; + } +} + +/* * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM * Vol 3B Table 15-20). 
But this confuses both the code that determines @@ -1579,6 +1625,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) /* Add per CPU specific workarounds here */ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; if (c->x86_vendor == X86_VENDOR_UNKNOWN) { @@ -1588,7 +1635,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && cfg->banks > 4) { + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1607,7 +1654,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && cfg->banks > 0) + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) mce_banks[0].ctl = 0; /* @@ -1629,7 +1676,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) mce_banks[0].init = 0; /* @@ -1815,7 +1862,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; - if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { + __mcheck_cpu_cap_init(); + + if (__mcheck_cpu_apply_quirks(c) < 0) { mca_cfg.disabled = 1; return; } @@ -1832,6 +1881,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_check_banks(); __mcheck_cpu_setup_timer(); } @@ -1863,7 +1913,7 @@ static void __mce_disable_bank(void *arg) void mce_disable_bank(int bank) { - if (bank >= mca_cfg.banks) { + if (bank >= this_cpu_read(mce_num_banks)) { pr_warn(FW_BUG "Ignoring request to disable invalid MCA bank %d.\n", bank); @@ -1949,9 +1999,10 @@ int __init mcheck_init(void) */ static void mce_disable_error_reporting(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2051,26 +2102,47 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) +static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr) { - return container_of(attr, struct mce_bank, attr); + return container_of(attr, struct mce_bank_dev, attr); } static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; + + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + return sprintf(buf, "%llx\n", b->ctl); } static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; u64 new; if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; - attr_to_bank(attr)->ctl = new; + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if 
(!b->init) + return -ENODEV; + + b->ctl = new; mce_restart(); return size; @@ -2185,7 +2257,7 @@ static void mce_device_release(struct device *dev) kfree(dev); } -/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +/* Per CPU device init. All of the CPUs still share the same bank device: */ static int mce_device_create(unsigned int cpu) { struct device *dev; @@ -2217,8 +2289,8 @@ static int mce_device_create(unsigned int cpu) if (err) goto error; } - for (j = 0; j < mca_cfg.banks; j++) { - err = device_create_file(dev, &mce_banks[j].attr); + for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) { + err = device_create_file(dev, &mce_bank_devs[j].attr); if (err) goto error2; } @@ -2228,7 +2300,7 @@ static int mce_device_create(unsigned int cpu) return 0; error2: while (--j >= 0) - device_remove_file(dev, &mce_banks[j].attr); + device_remove_file(dev, &mce_bank_devs[j].attr); error: while (--i >= 0) device_remove_file(dev, mce_device_attrs[i]); @@ -2249,8 +2321,8 @@ static void mce_device_remove(unsigned int cpu) for (i = 0; mce_device_attrs[i]; i++) device_remove_file(dev, mce_device_attrs[i]); - for (i = 0; i < mca_cfg.banks; i++) - device_remove_file(dev, &mce_banks[i].attr); + for (i = 0; i < per_cpu(mce_num_banks, cpu); i++) + device_remove_file(dev, &mce_bank_devs[i].attr); device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); @@ -2271,6 +2343,7 @@ static void mce_disable_cpu(void) static void mce_reenable_cpu(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) @@ -2278,7 +2351,7 @@ static void mce_reenable_cpu(void) if (!cpuhp_tasks_frozen) cmci_reenable(); - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2328,10 +2401,12 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; + for (i = 0; i < MAX_NR_BANKS; i++) { + struct mce_bank_dev *b = &mce_bank_devs[i]; struct device_attribute *a = &b->attr; + b->bank = i; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); @@ -2441,22 +2516,16 @@ static int fake_panic_set(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mcheck_debugfs_init(void) +static void __init mcheck_debugfs_init(void) { - struct dentry *dmce, *ffake_panic; + struct dentry *dmce; dmce = mce_get_debugfs_dir(); - if (!dmce) - return -ENOMEM; - ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, - NULL, &fake_panic_fops); - if (!ffake_panic) - return -ENOMEM; - - return 0; + debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); } #else -static int __init mcheck_debugfs_init(void) { return -EINVAL; } +static void __init mcheck_debugfs_init(void) { } #endif DEFINE_STATIC_KEY_FALSE(mcsafe_key); @@ -2464,8 +2533,6 @@ EXPORT_SYMBOL_GPL(mcsafe_key); static int __init mcheck_late_init(void) { - pr_info("Using %d MCE banks\n", mca_cfg.banks); - if (mca_cfg.recovery) static_branch_inc(&mcsafe_key); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 5d108f70f315..1f30117b24ba 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -645,7 +645,6 @@ static const struct file_operations readme_fops = { static struct dfs_node { char *name; - struct dentry *d; const struct 
file_operations *fops; umode_t perm; } dfs_fls[] = { @@ -659,49 +658,23 @@ static struct dfs_node { { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, }; -static int __init debugfs_init(void) +static void __init debugfs_init(void) { unsigned int i; dfs_inj = debugfs_create_dir("mce-inject", NULL); - if (!dfs_inj) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - dfs_fls[i].perm, - dfs_inj, - &i_mce, - dfs_fls[i].fops); - - if (!dfs_fls[i].d) - goto err_dfs_add; - } - - return 0; - -err_dfs_add: - while (i-- > 0) - debugfs_remove(dfs_fls[i].d); - debugfs_remove(dfs_inj); - dfs_inj = NULL; - - return -ENODEV; + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj, + &i_mce, dfs_fls[i].fops); } static int __init inject_init(void) { - int err; - if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; - err = debugfs_init(); - if (err) { - free_cpumask_var(mce_inject_cpumask); - return err; - } + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); mce_register_injector_chain(&inject_nb); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index a34b55baa7aa..43031db429d2 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -22,17 +22,8 @@ enum severity_level { extern struct blocking_notifier_head x86_mce_decoder_chain; -#define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ -/* One object for each MCE bank, shared by all CPUs */ -struct mce_bank { - u64 ctl; /* subevents to enable */ - unsigned char init; /* initialise bank? */ - struct device_attribute attr; /* device attribute */ - char attrname[ATTR_LEN]; /* attribute name */ -}; - struct mce_evt_llist { struct llist_node llnode; struct mce mce; @@ -47,7 +38,6 @@ struct llist_node *mce_gen_pool_prepare_records(void); extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); -extern struct mce_bank *mce_banks; extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL @@ -128,7 +118,6 @@ struct mca_config { bios_cmci_threshold : 1, __reserved : 59; - u8 banks; s8 bootlog; int tolerant; int monarch_timeout; @@ -137,6 +126,7 @@ struct mca_config { }; extern struct mca_config mca_cfg; +DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_vendor_flags { /* diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 2d33a26d257e..210f1f5db5f7 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -400,21 +400,13 @@ static const struct file_operations severities_coverage_fops = { static int __init severities_debugfs_init(void) { - struct dentry *dmce, *fsev; + struct dentry *dmce; dmce = mce_get_debugfs_dir(); - if (!dmce) - goto err_out; - - fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, - &severities_coverage_fops); - if (!fsev) - goto err_out; + debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); return 0; - -err_out: - return -ENOMEM; } late_initcall(severities_debugfs_init); #endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 4ddadf672ab5..a0e52bd00ecc 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -59,7 +59,7 @@ static 
u8 amd_ucode_patch[PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/microcode.txt + * format. See Documentation/x86/microcode.rst */ static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index d0dfb892c72f..aed45b8895d5 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -4,6 +4,8 @@ # Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # +set -e + IN=$1 OUT=$2 diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 7df29f08871b..062f77279ce3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -17,6 +17,7 @@ #include <linux/irq.h> #include <linux/kexec.h> #include <linux/i8253.h> +#include <linux/random.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> @@ -80,6 +81,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); ack_APIC_irq(); exiting_irq(); @@ -89,7 +91,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) { *vector = HYPERV_STIMER0_VECTOR; - *irq = 0; /* Unused on x86/x64 */ + *irq = -1; /* Unused on x86/x64 */ hv_stimer0_handler = handler; return 0; } @@ -266,9 +268,9 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); - lapic_timer_frequency = hv_lapic_frequency; + lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", - lapic_timer_frequency); + lapic_timer_period); } register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9356c1c9024d..aa5c064a6a22 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - wbinvd(); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (boot_cpu_has(X86_FEATURE_PGE)) { @@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); - wbinvd(); + + /* Again, only flush caches if we have to. 
*/
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
 }
 static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 604c0e3bcc83..d7623e1b927d 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -431,11 +431,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
 #else
 register unsigned int line_size asm("esi");
 register unsigned int size asm("edi");
-#ifdef CONFIG_X86_64
- register void *mem_r asm("rbx");
-#else
- register void *mem_r asm("ebx");
-#endif /* CONFIG_X86_64 */
+ register void *mem_r asm(_ASM_BX);
 #endif /* CONFIG_KASAN */
 /*
@@ -1503,7 +1499,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
 * may be scheduled elsewhere and invalidate entries in the
 * pseudo-locked region.
 */
- if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+ if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
 mutex_unlock(&rdtgroup_mutex);
 return -EINVAL;
 }
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 2f4824793798..bf3034994754 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2488,21 +2488,21 @@ out_destroy:
 * modification to the CBM if the default does not satisfy the
 * requirements.
 */
-static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
+static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
 {
- unsigned long val = *_val;
 unsigned int cbm_len = r->cache.cbm_len;
 unsigned long first_bit, zero_bit;
+ unsigned long val = _val;
- if (val == 0)
- return;
+ if (!val)
+ return 0;
 first_bit = find_first_bit(&val, cbm_len);
 zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
 /* Clear any remaining bits to ensure contiguous region */
 bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
- *_val = (u32)val;
+ return (u32)val;
 }
 /*
@@ -2560,7 +2560,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
 * Force the initial CBM to be valid, user can
 * modify the CBM based on system availability.
 */
- cbm_ensure_valid(&d->new_ctrl, r);
+ d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r);
 /*
 * Assign the u32 CBM to an unsigned long to ensure that
 * bitmap_weight() does not access out-of-bound memory.
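The cbm_ensure_valid() change above turns the helper into one that returns the fixed-up bitmask instead of editing it through a pointer. A freestanding sketch of the same contiguity fix-up, using compiler builtins instead of the kernel's find_first_bit()/find_next_zero_bit()/bitmap_clear() so it compiles in user space (the function name is ours, not the kernel's; cbm_len is assumed to be at most 32):

static unsigned int cbm_make_contiguous(unsigned int val, unsigned int cbm_len)
{
	unsigned int first_bit, zero_bit;

	if (!val)
		return 0;

	/* Lowest set bit, then the first clear bit above it. */
	first_bit = __builtin_ctz(val);
	zero_bit = first_bit;
	while (zero_bit < cbm_len && (val & (1u << zero_bit)))
		zero_bit++;

	/* Drop everything above the first hole so the region stays contiguous. */
	if (zero_bit < 32)
		val &= (1u << zero_bit) - 1;

	return val;
}

For example, val = 0x2d (binary 101101) with cbm_len = 11 yields 0x1: the run starting at bit 0 ends at bit 1, which matches what the kernel version's bitmap_clear() produces.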
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 94aa1c72ca98..adf9b71386ef 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -26,6 +26,10 @@ struct cpuid_bit {
 static const struct cpuid_bit cpuid_bits[] = {
 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
 { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
 { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
 { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
 { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 8f6c784141d1..ee48c3fc8a65 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -15,33 +15,66 @@
 /* leaf 0xb SMT level */
 #define SMT_LEVEL 0
-/* leaf 0xb sub-leaf types */
+/* extended topology sub-leaf types */
 #define INVALID_TYPE 0
 #define SMT_TYPE 1
 #define CORE_TYPE 2
+#define DIE_TYPE 5
 #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
 #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
 #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
-int detect_extended_topology_early(struct cpuinfo_x86 *c)
-{
 #ifdef CONFIG_SMP
+unsigned int __max_die_per_package __read_mostly = 1;
+EXPORT_SYMBOL(__max_die_per_package);
+
+/*
+ * Check if the given CPUID extended topology "leaf" is implemented
+ */
+static int check_extended_topology_leaf(int leaf)
+{
 unsigned int eax, ebx, ecx, edx;
- if (c->cpuid_level < 0xb)
+ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+ if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
 return -1;
- cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+ return 0;
}
+/*
+ * Return the best CPUID Extended Topology Leaf supported
+ */
+static int detect_extended_topology_leaf(struct cpuinfo_x86 *c)
+{
+ if (c->cpuid_level >= 0x1f) {
+ if (check_extended_topology_leaf(0x1f) == 0)
+ return 0x1f;
+ }
- /*
- * check if the cpuid leaf 0xb is actually implemented.
- */
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+ if (c->cpuid_level >= 0xb) {
+ if (check_extended_topology_leaf(0xb) == 0)
+ return 0xb;
+ }
+
+ return -1;
+}
+#endif
+
+int detect_extended_topology_early(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int eax, ebx, ecx, edx;
+ int leaf;
+
+ leaf = detect_extended_topology_leaf(c);
+ if (leaf < 0)
 return -1;
 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
 /*
 * initial apic id, which also represents 32-bit extended x2apic id.
 */
@@ -52,7 +85,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
 }
 /*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * Check for extended topology enumeration cpuid leaf, and if it
 * exists, use it for populating initial_apicid and cpu topology
 * detection.
*/ @@ -60,22 +93,28 @@ int detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; + unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; + unsigned int die_select_mask, die_level_siblings; + int leaf; - if (detect_extended_topology_early(c) < 0) + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) return -1; /* * Populate HT related information from sub-leaf level 0. */ - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + c->initial_apicid = edx; core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* * Check for the Core type in the implemented sub leaves. @@ -83,23 +122,34 @@ int detect_extended_topology(struct cpuinfo_x86 *c) if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - break; + die_level_siblings = core_level_siblings; + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + } + if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } sub_index++; } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; - - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) - & core_select_mask; - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + die_select_mask = (~(-1 << die_plus_mask_width)) >> + core_plus_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + ht_mask_width) & core_select_mask; + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, + die_plus_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); + __max_die_per_package = (die_level_siblings / core_level_siblings); #endif return 0; } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c new file mode 100644 index 000000000000..6a204e7336c1 --- /dev/null +++ b/arch/x86/kernel/cpu/umwait.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/syscore_ops.h> +#include <linux/suspend.h> +#include <linux/cpu.h> + +#include <asm/msr.h> + +#define UMWAIT_C02_ENABLE 0 + +#define UMWAIT_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) + +/* + * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default, + * umwait max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); + +/* + * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in + * the sysfs write functions. 
+ */
+static DEFINE_MUTEX(umwait_lock);
+
+static void umwait_update_control_msr(void *unused)
+{
+ lockdep_assert_irqs_disabled();
+ wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of umwait_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value, and after the return from the IPI the
+ * previous value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int umwait_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ umwait_update_control_msr(NULL);
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should not be required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void umwait_syscore_resume(void)
+{
+ umwait_update_control_msr(NULL);
+}
+
+static struct syscore_ops umwait_syscore_ops = {
+ .resume = umwait_syscore_resume,
+};
+
+/* sysfs interface */
+
+/*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled.
+ */
+static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
+{
+ return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
+}
+
+static inline u32 umwait_ctrl_max_time(u32 ctrl)
+{
+ return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+}
+
+static inline void umwait_update_control(u32 maxtime, bool c02_enable)
+{
+ u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+
+ if (!c02_enable)
+ ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
+
+ WRITE_ONCE(umwait_control_cached, ctrl);
+ /* Propagate to all CPUs */
+ on_each_cpu(umwait_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool c02_enable;
+ u32 ctrl;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
+ umwait_update_control(ctrl, c02_enable);
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 max_time, ctrl;
+ int ret;
+
+ ret = kstrtou32(buf, 0, &max_time);
+ if (ret)
+ return ret;
+
+ /* bits[1:0] must be zero */
+ if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
+ return -EINVAL; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (max_time != umwait_ctrl_max_time(ctrl)) + umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + +static struct attribute *umwait_attrs[] = { + &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, + NULL +}; + +static struct attribute_group umwait_attr_group = { + .attrs = umwait_attrs, + .name = "umwait_control", +}; + +static int __init umwait_init(void) +{ + struct device *dev; + int ret; + + if (!boot_cpu_has(X86_FEATURE_WAITPKG)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", + umwait_cpu_online, NULL); + + register_syscore_ops(&umwait_syscore_ops); + + /* + * Add umwait control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. + */ + dev = cpu_subsys.dev_root; + return sysfs_create_group(&dev->kobj, &umwait_attr_group); +} +device_initcall(umwait_init); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 0eda91f8eeac..3c648476d4fb 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -157,7 +157,7 @@ static void __init vmware_platform_setup(void) #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ - lapic_timer_frequency = ecx / HZ; + lapic_timer_period = ecx / HZ; pr_info("Host bus clock speed read from hypervisor : %u Hz\n", ecx); #endif diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c new file mode 100644 index 000000000000..8e6f2f4b4afe --- /dev/null +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched.h> +#include <linux/sched/clock.h> + +#include <asm/cpufeature.h> + +#include "cpu.h" + +#define MSR_ZHAOXIN_FCR57 0x00001257 + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ + +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + +static void init_zhaoxin_cap(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* Enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable ACE unit */ + lo |= ACE_FCR; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* Enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable RNG unit */ + lo |= RNG_ENABLE; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* + * Store Extended Feature Flags as word 5 of the CPU + * capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } + + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + cpu_detect_cache_sizes(c); +} + +static void early_init_zhaoxin(struct cpuinfo_x86 *c) +{ + if 
(c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + +} + +static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + +static void init_zhaoxin(struct cpuinfo_x86 *c) +{ + early_init_zhaoxin(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (c->x86 >= 0x6) + init_zhaoxin_cap(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + if (cpu_has(c, X86_FEATURE_VMX)) + zhaoxin_detect_vmx_virtcap(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + return size; +} +#endif + +static const struct cpu_dev zhaoxin_cpu_dev = { + .c_vendor = "zhaoxin", + .c_ident = { " Shanghai " }, + .c_early_init = early_init_zhaoxin, + .c_init = init_zhaoxin, +#ifdef CONFIG_X86_32 + .legacy_cache_size = zhaoxin_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_ZHAOXIN, +}; + +cpu_dev_register(zhaoxin_cpu_dev); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 576b2e1bfc12..2bf70a2fed90 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -56,7 +56,6 @@ struct crash_memmap_data { */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); -unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -73,14 +72,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { -#ifdef CONFIG_X86_32 - struct pt_regs fixed_regs; - - if (!user_mode(regs)) { - crash_fixup_ss_esp(&fixed_regs, regs); - regs = &fixed_regs; - } -#endif crash_save_cpu(regs, cpu); /* @@ -181,6 +172,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE + +static unsigned long 
crash_zero_bytes;
+
 static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
 {
 unsigned int *nr_ranges = arg;
@@ -381,6 +375,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
 walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
 memmap_entry_callback);
+ /* Add e820 reserved ranges */
+ cmd.type = E820_TYPE_RESERVED;
+ flags = IORESOURCE_MEM;
+ walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
 /* Add crashk_low_res region */
 if (crashk_low_res.end) {
 ei.addr = crashk_low_res.start;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8f32e705a980..e69408bf664b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1063,10 +1063,10 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
 case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
 case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
 case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+ case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
 case E820_TYPE_RESERVED_KERN: /* Fall-through: */
 case E820_TYPE_RAM: /* Fall-through: */
 case E820_TYPE_UNUSABLE: /* Fall-through: */
- case E820_TYPE_RESERVED: /* Fall-through: */
 default: return IORES_DESC_NONE;
 }
 }
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 649fbc3fcf9f..12c70840980e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -43,18 +43,6 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu);
 */
 DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-static void kernel_fpu_disable(void)
-{
- WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- this_cpu_write(in_kernel_fpu, true);
-}
-
-static void kernel_fpu_enable(void)
-{
- WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
- this_cpu_write(in_kernel_fpu, false);
-}
-
 static bool kernel_fpu_disabled(void)
 {
 return this_cpu_read(in_kernel_fpu);
@@ -94,42 +82,33 @@ bool irq_fpu_usable(void)
 }
 EXPORT_SYMBOL(irq_fpu_usable);
-static void __kernel_fpu_begin(void)
+void kernel_fpu_begin(void)
 {
- struct fpu *fpu = &current->thread.fpu;
+ preempt_disable();
 WARN_ON_FPU(!irq_fpu_usable());
+ WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- kernel_fpu_disable();
+ this_cpu_write(in_kernel_fpu, true);
- if (!(current->flags & PF_KTHREAD)) {
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- set_thread_flag(TIF_NEED_FPU_LOAD);
- /*
- * Ignore return value -- we don't care if reg state
- * is clobbered.
- */
- copy_fpregs_to_fpstate(fpu);
- }
+ if (!(current->flags & PF_KTHREAD) &&
+ !test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ /*
+ * Ignore return value -- we don't care if reg state
+ * is clobbered.
+ */
+ copy_fpregs_to_fpstate(&current->thread.fpu);
 }
 __cpu_invalidate_fpregs_state();
 }
-
-static void __kernel_fpu_end(void)
-{
- kernel_fpu_enable();
-}
-
-void kernel_fpu_begin(void)
-{
- preempt_disable();
- __kernel_fpu_begin();
-}
 EXPORT_SYMBOL_GPL(kernel_fpu_begin);
 void kernel_fpu_end(void)
 {
- __kernel_fpu_end();
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+
+ this_cpu_write(in_kernel_fpu, false);
 preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
@@ -155,7 +134,6 @@ void fpu__save(struct fpu *fpu)
 trace_x86_fpu_after_save(fpu);
 fpregs_unlock();
 }
-EXPORT_SYMBOL_GPL(fpu__save);
 /*
 * Legacy x87 fpstate state init:
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index ef0030e3fe6b..6ce7e0a23268 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -204,12 +204,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
 */
 if (!boot_cpu_has(X86_FEATURE_FPU)) {
- /*
- * Disable xsave as we do not support it if i387
- * emulation is enabled.
- */
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
 fpu_kernel_xstate_size = sizeof(struct swregs_state);
 } else {
 if (boot_cpu_has(X86_FEATURE_FXSR))
@@ -252,17 +246,20 @@ static void __init fpu__init_parse_early_param(void)
 char *argptr = arg;
 int bit;
+#ifdef CONFIG_X86_32
 if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
 setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' requires CONFIG_MATH_EMULATION to be enabled.\n");
+#endif
- if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
 setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- }
+#endif
 if (cmdline_find_option_bool(boot_command_line, "noxsave"))
- fpu__xstate_clear_all_cpu_caps();
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 3c36dd1784db..e5cb67d67c03 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -8,6 +8,8 @@
 #include <linux/cpu.h>
 #include <linux/mman.h>
 #include <linux/pkeys.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
 #include <asm/fpu/api.h>
 #include <asm/fpu/internal.h>
@@ -68,15 +70,6 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
 unsigned int fpu_user_xstate_size;
 /*
- * Clear all of the X86_FEATURE_* bits that are unavailable
- * when the CPU has no XSAVE support.
- */
-void fpu__xstate_clear_all_cpu_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-}
-
-/*
 * Return whether the system supports a given xfeature.
 *
 * Also return the name of the (most advanced) feature that the caller requested:
@@ -709,7 +702,7 @@ static void fpu__init_disable_system_xstate(void)
 {
 xfeatures_mask = 0;
 cr4_clear_bits(X86_CR4_OSXSAVE);
- fpu__xstate_clear_all_cpu_caps();
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 }
 /*
@@ -1240,3 +1233,48 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 return 0;
 }
+
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+/*
+ * Report the amount of time elapsed in milliseconds since the last AVX512
+ * use in the task.
+ */
+static void avx512_status(struct seq_file *m, struct task_struct *task)
+{
+ unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
+ long delta;
+
+ if (!timestamp) {
+ /*
+ * Report -1 if no AVX512 usage
+ */
+ delta = -1;
+ } else {
+ delta = (long)(jiffies - timestamp);
+ /*
+ * Cap to LONG_MAX if time difference > LONG_MAX
+ */
+ if (delta < 0)
+ delta = LONG_MAX;
+ delta = jiffies_to_msecs(delta);
+ }
+
+ seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Report architecture-specific information
+ */
+int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ /*
+ * Report AVX512 state if the processor and the build option support it.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_AVX512F))
+ avx512_status(m, task);
+
+ return 0;
+}
+#endif /* CONFIG_PROC_PID_ARCH_STATUS */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0927bb158ffc..4b73f5937f41 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/module.h>
+#include <linux/memory.h>
 #include <trace/syscall.h>
@@ -34,16 +35,25 @@
 #ifdef CONFIG_DYNAMIC_FTRACE
 int ftrace_arch_code_modify_prepare(void)
+ __acquires(&text_mutex)
 {
+ /*
+ * Need to grab text_mutex to prevent a race from module loading
+ * and live kernel patching from changing the text permissions while
+ * ftrace has it set to "read/write".
+ */
+ mutex_lock(&text_mutex);
 set_kernel_text_rw();
 set_all_modules_text_rw();
 return 0;
 }
 int ftrace_arch_code_modify_post_process(void)
+ __releases(&text_mutex)
 {
 set_all_modules_text_ro();
 set_kernel_text_ro();
+ mutex_unlock(&text_mutex);
 return 0;
 }
@@ -300,7 +310,6 @@ int ftrace_int3_handler(struct pt_regs *regs)
 ip = regs->ip - INT3_INSN_SIZE;
-#ifdef CONFIG_X86_64
 if (ftrace_location(ip)) {
 int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
 return 1;
@@ -312,12 +321,6 @@ int ftrace_int3_handler(struct pt_regs *regs)
 int3_emulate_call(regs, ftrace_update_func_call);
 return 1;
 }
-#else
- if (ftrace_location(ip) || is_ftrace_caller(ip)) {
- int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
- return 1;
- }
-#endif
 return 0;
 }
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 2ba914a34b06..073aab525d80 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -9,6 +9,8 @@
 #include <asm/export.h>
 #include <asm/ftrace.h>
 #include <asm/nospec-branch.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
 # define function_hook __fentry__
 EXPORT_SYMBOL(__fentry__)
@@ -89,26 +91,38 @@ END(ftrace_caller)
 ENTRY(ftrace_regs_caller)
 /*
- * i386 does not save SS and ESP when coming from kernel.
- * Instead, to get sp, &regs->sp is used (see ptrace.h).
- * Unfortunately, that means eflags must be at the same location
- * as the current return ip is. We move the return ip into the
- * regs->ip location, and move flags into the return ip location.
+ * We're here from an mcount/fentry CALL, and the stack frame looks like: + * + * <previous context> + * RET-IP + * + * The purpose of this function is to call out in an emulated INT3 + * environment with a stack frame like: + * + * <previous context> + * gap / RET-IP + * gap + * gap + * gap + * pt_regs + * + * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds */ - pushl $__KERNEL_CS - pushl 4(%esp) /* Save the return ip */ - pushl $0 /* Load 0 into orig_ax */ + subl $3*4, %esp # RET-IP + 3 gaps + pushl %ss # ss + pushl %esp # points at ss + addl $5*4, (%esp) # make it point at <previous context> + pushfl # flags + pushl $__KERNEL_CS # cs + pushl 7*4(%esp) # ip <- RET-IP + pushl $0 # orig_eax + pushl %gs pushl %fs pushl %es pushl %ds - pushl %eax - - /* Get flags and place them into the return ip slot */ - pushf - popl %eax - movl %eax, 8*4(%esp) + pushl %eax pushl %ebp pushl %edi pushl %esi @@ -116,24 +130,27 @@ ENTRY(ftrace_regs_caller) pushl %ecx pushl %ebx - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ + ENCODE_FRAME_POINTER + + movl PT_EIP(%esp), %eax # 1st argument: IP + subl $MCOUNT_INSN_SIZE, %eax + movl 21*4(%esp), %edx # 2nd argument: parent ip + movl function_trace_op, %ecx # 3rd argument: ftrace_pos + pushl %esp # 4th argument: pt_regs GLOBAL(ftrace_regs_call) call ftrace_stub - addl $4, %esp /* Skip pt_regs */ + addl $4, %esp # skip 4th argument - /* restore flags */ - push 14*4(%esp) - popf + /* place IP below the new SP */ + movl PT_OLDESP(%esp), %eax + movl PT_EIP(%esp), %ecx + movl %ecx, -4(%eax) - /* Move return ip back to its original location */ - movl 12*4(%esp), %eax - movl %eax, 14*4(%esp) + /* place EAX below that */ + movl PT_EAX(%esp), %ecx + movl %ecx, -8(%eax) popl %ebx popl %ecx @@ -141,14 +158,9 @@ GLOBAL(ftrace_regs_call) popl %esi popl %edi popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - /* use lea to not affect flags */ - lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + lea -8(%eax), %esp + popl %eax jmp .Lftrace_ret diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 10eb2760ef2c..809d54397dba 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -9,6 +9,7 @@ #include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> +#include <asm/frame.h> .code64 .section .entry.text, "ax" @@ -203,6 +204,8 @@ GLOBAL(ftrace_regs_caller_op_ptr) leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx movq %rcx, RSP(%rsp) + ENCODE_FRAME_POINTER + /* regs go into 4th parameter */ leaq (%rsp), %rcx diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a0573f2e7763..c43e96a938d0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,32 +1,44 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> -#include <linux/irq.h> #include <linux/export.h> #include <linux/delay.h> -#include <linux/errno.h> -#include <linux/i8253.h> -#include <linux/slab.h> #include <linux/hpet.h> -#include <linux/init.h> #include <linux/cpu.h> -#include <linux/pm.h> -#include <linux/io.h> +#include <linux/irq.h> -#include <asm/cpufeature.h> -#include <asm/irqdomain.h> -#include <asm/fixmap.h> #include <asm/hpet.h> #include <asm/time.h> -#define HPET_MASK CLOCKSOURCE_MASK(32) +#undef pr_fmt 
+#define pr_fmt(fmt) "hpet: " fmt -#define HPET_DEV_USED_BIT 2 -#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) -#define HPET_DEV_VALID 0x8 -#define HPET_DEV_FSB_CAP 0x1000 -#define HPET_DEV_PERI_CAP 0x2000 +enum hpet_mode { + HPET_MODE_UNUSED, + HPET_MODE_LEGACY, + HPET_MODE_CLOCKEVT, + HPET_MODE_DEVICE, +}; + +struct hpet_channel { + struct clock_event_device evt; + unsigned int num; + unsigned int cpu; + unsigned int irq; + unsigned int in_use; + enum hpet_mode mode; + unsigned int boot_cfg; + char name[10]; +}; + +struct hpet_base { + unsigned int nr_channels; + unsigned int nr_clockevents; + unsigned int boot_cfg; + struct hpet_channel *channels; +}; + +#define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) @@ -39,22 +51,25 @@ u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; #ifdef CONFIG_PCI_MSI -static unsigned int hpet_num_timers; +static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); +static struct irq_domain *hpet_domain; #endif + static void __iomem *hpet_virt_address; -struct hpet_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int flags; - char name[10]; -}; +static struct hpet_base hpet_base; + +static bool hpet_legacy_int_enabled; +static unsigned long hpet_freq; -static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; + +static inline +struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt) { - return container_of(evtdev, struct hpet_dev, evt); + return container_of(evt, struct hpet_channel, evt); } inline unsigned int hpet_readl(unsigned int a) @@ -67,10 +82,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a) writel(d, hpet_virt_address + a); } -#ifdef CONFIG_X86_64 -#include <asm/pgtable.h> -#endif - static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); @@ -85,10 +96,6 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -bool boot_hpet_disable; -bool hpet_force_user; -static bool hpet_verbose; - static int __init hpet_setup(char *str) { while (str) { @@ -120,13 +127,8 @@ static inline int is_hpet_capable(void) return !boot_hpet_disable && hpet_address; } -/* - * HPET timer interrupt enable / disable - */ -static bool hpet_legacy_int_enabled; - /** - * is_hpet_enabled - check whether the hpet timer interrupt is enabled + * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled */ int is_hpet_enabled(void) { @@ -136,32 +138,36 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); static void _hpet_print_config(const char *function, int line) { - u32 i, timers, l, h; - printk(KERN_INFO "hpet: %s(%d):\n", function, line); - l = hpet_readl(HPET_ID); - h = hpet_readl(HPET_PERIOD); - timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h); - l = hpet_readl(HPET_CFG); - h = hpet_readl(HPET_STATUS); - printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h); + u32 i, id, period, cfg, status, channels, l, h; + + pr_info("%s(%d):\n", function, line); + + id = hpet_readl(HPET_ID); + period = hpet_readl(HPET_PERIOD); + pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period); + + cfg = hpet_readl(HPET_CFG); + status = hpet_readl(HPET_STATUS); + pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status); + l = hpet_readl(HPET_COUNTER); h = 
hpet_readl(HPET_COUNTER+4); - printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - for (i = 0; i < timers; i++) { + for (i = 0; i < channels; i++) { l = hpet_readl(HPET_Tn_CFG(i)); h = hpet_readl(HPET_Tn_CFG(i)+4); - printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", - i, l, h); + pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h); + l = hpet_readl(HPET_Tn_CMP(i)); h = hpet_readl(HPET_Tn_CMP(i)+4); - printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", - i, l, h); + pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h); + l = hpet_readl(HPET_Tn_ROUTE(i)); h = hpet_readl(HPET_Tn_ROUTE(i)+4); - printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", - i, l, h); + pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h); } } @@ -172,31 +178,20 @@ do { \ } while (0) /* - * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * When the HPET driver (/dev/hpet) is enabled, we need to reserve * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd); - -static void hpet_reserve_platform_timers(unsigned int id) +static void __init hpet_reserve_platform_timers(void) { - struct hpet __iomem *hpet = hpet_virt_address; - struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; - unsigned int nrtimers, i; struct hpet_data hd; - - nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + unsigned int i; memset(&hd, 0, sizeof(hd)); hd.hd_phys_address = hpet_address; - hd.hd_address = hpet; - hd.hd_nirqs = nrtimers; - hpet_reserve_timer(&hd, 0); - -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = hpet_base.nr_channels; /* * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 @@ -206,30 +201,52 @@ static void hpet_reserve_platform_timers(unsigned int id) hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) { - hd.hd_irq[i] = (readl(&timer->hpet_config) & - Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; - } + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + if (i >= 2) + hd.hd_irq[i] = hc->irq; - hpet_reserve_msi_timers(&hd); + switch (hc->mode) { + case HPET_MODE_UNUSED: + case HPET_MODE_DEVICE: + hc->mode = HPET_MODE_DEVICE; + break; + case HPET_MODE_CLOCKEVT: + case HPET_MODE_LEGACY: + hpet_reserve_timer(&hd, hc->num); + break; + } + } hpet_alloc(&hd); +} +static void __init hpet_select_device_channel(void) +{ + int i; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + /* Associate the first unused channel to /dev/hpet */ + if (hc->mode == HPET_MODE_UNUSED) { + hc->mode = HPET_MODE_DEVICE; + return; + } + } } + #else -static void hpet_reserve_platform_timers(unsigned int id) { } +static inline void hpet_reserve_platform_timers(void) { } +static inline void hpet_select_device_channel(void) {} #endif -/* - * Common hpet info - */ -static unsigned long hpet_freq; - -static struct clock_event_device hpet_clockevent; - +/* Common HPET functions */ static void hpet_stop_counter(void) { u32 cfg = hpet_readl(HPET_CFG); + cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -243,6 +260,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { unsigned int cfg = hpet_readl(HPET_CFG); + cfg |= 
HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -274,24 +292,9 @@ static void hpet_enable_legacy_int(void) hpet_legacy_int_enabled = true; } -static void hpet_legacy_clockevent_register(void) -{ - /* Start HPET legacy interrupts */ - hpet_enable_legacy_int(); - - /* - * Start hpet with the boot cpu mask and make it - * global after the IO_APIC has been initialized. - */ - hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index); - clockevents_config_and_register(&hpet_clockevent, hpet_freq, - HPET_MIN_PROG_DELTA, 0x7FFFFFFF); - global_clock_event = &hpet_clockevent; - printk(KERN_DEBUG "hpet clockevent registered\n"); -} - -static int hpet_set_periodic(struct clock_event_device *evt, int timer) +static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg, cmp, now; uint64_t delta; @@ -300,11 +303,11 @@ static int hpet_set_periodic(struct clock_event_device *evt, int timer) delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned int)delta; - cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - hpet_writel(cmp, HPET_Tn_CMP(timer)); + hpet_writel(cfg, HPET_Tn_CFG(channel)); + hpet_writel(cmp, HPET_Tn_CMP(channel)); udelay(1); /* * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL @@ -313,52 +316,55 @@ static int hpet_set_periodic(struct clock_event_device *evt, int timer) * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel)); hpet_start_counter(); hpet_print_config(); return 0; } -static int hpet_set_oneshot(struct clock_event_device *evt, int timer) +static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg; - cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cfg, HPET_Tn_CFG(channel)); return 0; } -static int hpet_shutdown(struct clock_event_device *evt, int timer) +static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg; - cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cfg, HPET_Tn_CFG(channel)); return 0; } -static int hpet_resume(struct clock_event_device *evt) +static int hpet_clkevt_legacy_resume(struct clock_event_device *evt) { hpet_enable_legacy_int(); hpet_print_config(); return 0; } -static int hpet_next_event(unsigned long delta, - struct clock_event_device *evt, int timer) +static int +hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; u32 cnt; s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; - hpet_writel(cnt, HPET_Tn_CMP(timer)); + hpet_writel(cnt, HPET_Tn_CMP(channel)); /* * HPETs are a complete disaster. The compare register is @@ -387,360 +393,250 @@ static int hpet_next_event(unsigned long delta, return res < HPET_MIN_CYCLES ? 
-ETIME : 0; } -static int hpet_legacy_shutdown(struct clock_event_device *evt) +static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating) { - return hpet_shutdown(evt, 0); -} + struct clock_event_device *evt = &hc->evt; -static int hpet_legacy_set_oneshot(struct clock_event_device *evt) -{ - return hpet_set_oneshot(evt, 0); -} + evt->rating = rating; + evt->irq = hc->irq; + evt->name = hc->name; + evt->cpumask = cpumask_of(hc->cpu); + evt->set_state_oneshot = hpet_clkevt_set_state_oneshot; + evt->set_next_event = hpet_clkevt_set_next_event; + evt->set_state_shutdown = hpet_clkevt_set_state_shutdown; -static int hpet_legacy_set_periodic(struct clock_event_device *evt) -{ - return hpet_set_periodic(evt, 0); + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hc->boot_cfg & HPET_TN_PERIODIC) { + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_clkevt_set_state_periodic; + } } -static int hpet_legacy_resume(struct clock_event_device *evt) +static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) { - return hpet_resume(evt); -} + /* + * Start HPET with the boot CPU's cpumask and make it global after + * the IO_APIC has been initialized. + */ + hc->cpu = boot_cpu_data.cpu_index; + strncpy(hc->name, "hpet", sizeof(hc->name)); + hpet_init_clockevent(hc, 50); -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - return hpet_next_event(delta, evt, 0); -} + hc->evt.tick_resume = hpet_clkevt_legacy_resume; -/* - * The hpet clock event device - */ -static struct clock_event_device hpet_clockevent = { - .name = "hpet", - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT, - .set_state_periodic = hpet_legacy_set_periodic, - .set_state_oneshot = hpet_legacy_set_oneshot, - .set_state_shutdown = hpet_legacy_shutdown, - .tick_resume = hpet_legacy_resume, - .set_next_event = hpet_legacy_next_event, - .irq = 0, - .rating = 50, -}; + /* + * Legacy horrors and sins from the past. HPET used periodic mode + * unconditionally forever on the legacy channel 0. Removing the + * below hack and using the conditional in hpet_init_clockevent() + * makes at least Qemu and one hardware machine fail to boot. + * There are two issues which cause the boot failure: + * + * #1 After the timer delivery test in IOAPIC and the IOAPIC setup + * the next interrupt is not delivered despite the HPET channel + * being programmed correctly. Reprogramming the HPET after + * switching to IOAPIC makes it work again. After fixing this, + * the next issue surfaces: + * + * #2 Due to the unconditional periodic mode availability the Local + * APIC timer calibration can hijack the global clockevents + * event handler without causing damage. Using oneshot at this + * stage makes it hang because the HPET does not get + * reprogrammed due to the handler hijacking. Duh, stupid me! + * + * Both issues require major surgery, and especially kicking the HPET + * again after enabling the IOAPIC results in really nasty hackery. + * This 'assume periodic works' magic has survived since HPET + * support got added, so it's questionable whether this should be + * fixed. Both Qemu and the failing hardware machine support + * periodic mode despite the fact that both don't advertise it in + * the configuration register and both need that extra kick after + * switching to IOAPIC. Seems to be a feature...
+ */ + hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC; + hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic; + + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + clockevents_config_and_register(&hc->evt, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); + global_clock_event = &hc->evt; + pr_debug("Clockevent registered\n"); +} /* * HPET MSI Support */ #ifdef CONFIG_PCI_MSI -static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); -static struct hpet_dev *hpet_devs; -static struct irq_domain *hpet_domain; - void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; - /* unmask it */ - cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); cfg |= HPET_TN_ENABLE | HPET_TN_FSB; - hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } void hpet_msi_mask(struct irq_data *data) { - struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; - /* mask it */ - cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); - hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); -} - -void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg) -{ - hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); - hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) +void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { - msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); - msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); - msg->address_hi = 0; + hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } -static int hpet_msi_shutdown(struct clock_event_device *evt) +static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - return hpet_shutdown(evt, hdev->num); -} - -static int hpet_msi_set_oneshot(struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - return hpet_set_oneshot(evt, hdev->num); -} - -static int hpet_msi_set_periodic(struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - return hpet_set_periodic(evt, hdev->num); -} - -static int hpet_msi_resume(struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - struct irq_data *data = irq_get_irq_data(hdev->irq); + struct hpet_channel *hc = clockevent_to_channel(evt); + struct irq_data *data = irq_get_irq_data(hc->irq); struct msi_msg msg; /* Restore the MSI msg and unmask the interrupt */ irq_chip_compose_msi_msg(data, &msg); - hpet_msi_write(hdev, &msg); + hpet_msi_write(hc, &msg); hpet_msi_unmask(data); return 0; } -static int hpet_msi_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - return hpet_next_event(delta, evt, hdev->num); -} - -static irqreturn_t hpet_interrupt_handler(int irq, void *data) +static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data) { - struct hpet_dev *dev = (struct hpet_dev *)data; - struct clock_event_device *hevt = &dev->evt; + struct hpet_channel *hc = data; + struct clock_event_device *evt = &hc->evt; - if (!hevt->event_handler) { - printk(KERN_INFO "Spurious HPET timer 
interrupt on HPET timer %d\n", - dev->num); + if (!evt->event_handler) { + pr_info("Spurious interrupt HPET channel %d\n", hc->num); return IRQ_HANDLED; } - hevt->event_handler(hevt); + evt->event_handler(evt); return IRQ_HANDLED; } -static int hpet_setup_irq(struct hpet_dev *dev) +static int hpet_setup_msi_irq(struct hpet_channel *hc) { - - if (request_irq(dev->irq, hpet_interrupt_handler, + if (request_irq(hc->irq, hpet_msi_interrupt_handler, IRQF_TIMER | IRQF_NOBALANCING, - dev->name, dev)) + hc->name, hc)) return -1; - disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); - enable_irq(dev->irq); + disable_irq(hc->irq); + irq_set_affinity(hc->irq, cpumask_of(hc->cpu)); + enable_irq(hc->irq); - printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", - dev->name, dev->irq); + pr_debug("%s irq %u for MSI\n", hc->name, hc->irq); return 0; } -/* This should be called in specific @cpu */ -static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +/* Invoked from the hotplug callback on @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu) { - struct clock_event_device *evt = &hdev->evt; - - WARN_ON(cpu != smp_processor_id()); - if (!(hdev->flags & HPET_DEV_VALID)) - return; - - hdev->cpu = cpu; - per_cpu(cpu_hpet_dev, cpu) = hdev; - evt->name = hdev->name; - hpet_setup_irq(hdev); - evt->irq = hdev->irq; + struct clock_event_device *evt = &hc->evt; - evt->rating = 110; - evt->features = CLOCK_EVT_FEAT_ONESHOT; - if (hdev->flags & HPET_DEV_PERI_CAP) { - evt->features |= CLOCK_EVT_FEAT_PERIODIC; - evt->set_state_periodic = hpet_msi_set_periodic; - } + hc->cpu = cpu; + per_cpu(cpu_hpet_channel, cpu) = hc; + hpet_setup_msi_irq(hc); - evt->set_state_shutdown = hpet_msi_shutdown; - evt->set_state_oneshot = hpet_msi_set_oneshot; - evt->tick_resume = hpet_msi_resume; - evt->set_next_event = hpet_msi_next_event; - evt->cpumask = cpumask_of(hdev->cpu); + hpet_init_clockevent(hc, 110); + evt->tick_resume = hpet_clkevt_msi_resume; clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); } -#ifdef CONFIG_HPET -/* Reserve at least one timer for userspace (/dev/hpet) */ -#define RESERVE_TIMERS 1 -#else -#define RESERVE_TIMERS 0 -#endif - -static void hpet_msi_capability_lookup(unsigned int start_timer) +static struct hpet_channel *hpet_get_unused_clockevent(void) { - unsigned int id; - unsigned int num_timers; - unsigned int num_timers_used = 0; - int i, irq; - - if (hpet_msi_disable) - return; - - if (boot_cpu_has(X86_FEATURE_ARAT)) - return; - id = hpet_readl(HPET_ID); - - num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); - num_timers++; /* Value read out starts from 0 */ - hpet_print_config(); - - hpet_domain = hpet_create_irq_domain(hpet_blockid); - if (!hpet_domain) - return; - - hpet_devs = kcalloc(num_timers, sizeof(struct hpet_dev), GFP_KERNEL); - if (!hpet_devs) - return; - - hpet_num_timers = num_timers; - - for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { - struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); - - /* Only consider HPET timer with MSI support */ - if (!(cfg & HPET_TN_FSB_CAP)) - continue; + int i; - hdev->flags = 0; - if (cfg & HPET_TN_PERIODIC_CAP) - hdev->flags |= HPET_DEV_PERI_CAP; - sprintf(hdev->name, "hpet%d", i); - hdev->num = i; + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; - irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); - if (irq <= 0) + if (hc->mode != 
HPET_MODE_CLOCKEVT || hc->in_use) continue; - - hdev->irq = irq; - hdev->flags |= HPET_DEV_FSB_CAP; - hdev->flags |= HPET_DEV_VALID; - num_timers_used++; - if (num_timers_used == num_possible_cpus()) - break; + hc->in_use = 1; + return hc; } - - printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", - num_timers, num_timers_used); + return NULL; } -#ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd) +static int hpet_cpuhp_online(unsigned int cpu) { - int i; - - if (!hpet_devs) - return; + struct hpet_channel *hc = hpet_get_unused_clockevent(); - for (i = 0; i < hpet_num_timers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; + if (hc) + init_one_hpet_msi_clockevent(hc, cpu); + return 0; +} - if (!(hdev->flags & HPET_DEV_VALID)) - continue; +static int hpet_cpuhp_dead(unsigned int cpu) +{ + struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu); - hd->hd_irq[hdev->num] = hdev->irq; - hpet_reserve_timer(hd, hdev->num); - } + if (!hc) + return 0; + free_irq(hc->irq, hc); + hc->in_use = 0; + per_cpu(cpu_hpet_channel, cpu) = NULL; + return 0; } -#endif -static struct hpet_dev *hpet_get_unused_timer(void) +static void __init hpet_select_clockevents(void) { - int i; + unsigned int i; - if (!hpet_devs) - return NULL; + hpet_base.nr_clockevents = 0; - for (i = 0; i < hpet_num_timers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; + /* No point if MSI is disabled or CPU has an Always Running APIC Timer */ + if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) + return; - if (!(hdev->flags & HPET_DEV_VALID)) - continue; - if (test_and_set_bit(HPET_DEV_USED_BIT, - (unsigned long *)&hdev->flags)) - continue; - return hdev; - } - return NULL; -} + hpet_print_config(); -struct hpet_work_struct { - struct delayed_work work; - struct completion complete; -}; + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; -static void hpet_work(struct work_struct *w) -{ - struct hpet_dev *hdev; - int cpu = smp_processor_id(); - struct hpet_work_struct *hpet_work; + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + int irq; - hpet_work = container_of(w, struct hpet_work_struct, work.work); + if (hc->mode != HPET_MODE_UNUSED) + continue; - hdev = hpet_get_unused_timer(); - if (hdev) - init_one_hpet_msi_clockevent(hdev, cpu); + /* Only consider HPET channels with MSI support */ + if (!(hc->boot_cfg & HPET_TN_FSB_CAP)) + continue; - complete(&hpet_work->complete); -} + sprintf(hc->name, "hpet%d", i); -static int hpet_cpuhp_online(unsigned int cpu) -{ - struct hpet_work_struct work; - - INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); - init_completion(&work.complete); - /* FIXME: add schedule_work_on() */ - schedule_delayed_work_on(cpu, &work.work, 0); - wait_for_completion(&work.complete); - destroy_delayed_work_on_stack(&work.work); - return 0; -} + irq = hpet_assign_irq(hpet_domain, hc, hc->num); + if (irq <= 0) + continue; -static int hpet_cpuhp_dead(unsigned int cpu) -{ - struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + hc->irq = irq; + hc->mode = HPET_MODE_CLOCKEVT; - if (!hdev) - return 0; - free_irq(hdev->irq, hdev); - hdev->flags &= ~HPET_DEV_USED; - per_cpu(cpu_hpet_dev, cpu) = NULL; - return 0; -} -#else + if (++hpet_base.nr_clockevents == num_possible_cpus()) + break; + } -static void hpet_msi_capability_lookup(unsigned int start_timer) -{ - return; + pr_info("%d channels of %d reserved for per-cpu timers\n", + hpet_base.nr_clockevents, hpet_base.nr_channels); }
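Aside on the two hotplug callbacks just added: they are wired into the CPU hotplug state machine later, in hpet_late_init() further down, which registers hpet_cpuhp_online() under the dedicated CPUHP_AP_X86_HPET_ONLINE state. A minimal sketch of that registration pattern follows; the dynamic CPUHP_BP_PREPARE_DYN state used here for the dead path is an assumption for illustration, since that part of the patch is not visible in this hunk.

#include <linux/cpuhotplug.h>

/* Sketch: hook the per-CPU startup/teardown callbacks into the cpuhp
 * core. The startup callback runs on every CPU that is already online
 * and on each CPU that comes up later; the teardown callback runs
 * after a CPU has gone down. */
static int __init hpet_hotplug_setup_sketch(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
				hpet_cpuhp_online, NULL);
	if (ret)
		return ret;

	/* Dynamic state; assumed here for the dead callback */
	ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "x86/hpet:dead",
				NULL, hpet_cpuhp_dead);
	return ret < 0 ? ret : 0;
}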
-#ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd) -{ - return; -} -#endif +#else + +static inline void hpet_select_clockevents(void) { } #define hpet_cpuhp_online NULL #define hpet_cpuhp_dead NULL @@ -754,10 +650,10 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) /* * Reading the HPET counter is a very slow operation. If a large number of * CPUs are trying to access the HPET counter simultaneously, it can cause - * massive delay and slow down system performance dramatically. This may + * massive delays and slow down system performance dramatically. This may * happen when HPET is the default clock source instead of TSC. For a * really large system with hundreds of CPUs, the slowdown may be so - * severe that it may actually crash the system because of a NMI watchdog + * severe that it can actually crash the system because of an NMI watchdog * soft lockup, for example. * * If multiple CPUs are trying to access the HPET counter at the same time, @@ -766,10 +662,9 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) * * This special feature is only enabled on x86-64 systems. It is unlikely * that 32-bit x86 systems will have enough CPUs to require this feature - * with its associated locking overhead. And we also need 64-bit atomic - * read. + * with its associated locking overhead. We also need a 64-bit atomic read. * - * The lock and the hpet value are stored together and can be read in a + * The lock and the HPET value are stored together and can be read in a * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t * is 32 bits in size. */ @@ -858,15 +753,40 @@ static struct clocksource clocksource_hpet = { .resume = hpet_resume_counter, }; -static int hpet_clocksource_register(void) +/* + * AMD SB700 based systems with spread spectrum enabled use an SMM based + * HPET emulation to provide proper frequency setting. + * + * On such systems the SMM code is initialized with the first HPET register + * access and takes some time to complete. During this time the config + * register reads 0xffffffff. We check up to 1000 times whether the + * config register reads a non-0xffffffff value to make sure that the + * HPET is up and running before we proceed any further. + * + * A counting loop is safe, as the HPET access takes thousands of CPU cycles. + * + * On non-SB700 based machines this check is only done once and has no + * side effects. + */ +static bool __init hpet_cfg_working(void) { - u64 start, now; - u64 t1; + int i; + + for (i = 0; i < 1000; i++) { + if (hpet_readl(HPET_CFG) != 0xFFFFFFFF) + return true; + } + + pr_warn("Config register invalid. Disabling HPET\n"); + return false; +} + +static bool __init hpet_counting(void) +{ + u64 start, now, t1; - /* Start the counter */ hpet_restart_counter(); - /* Verify whether hpet counter works */ t1 = hpet_readl(HPET_COUNTER); start = rdtsc(); @@ -877,30 +797,24 @@ static int hpet_clocksource_register(void) * 1 GHz == 200us */ do { - rep_nop(); + if (t1 != hpet_readl(HPET_COUNTER)) + return true; now = rdtsc(); } while ((now - start) < 200000UL); - if (t1 == hpet_readl(HPET_COUNTER)) { - printk(KERN_WARNING - "HPET counter not counting. HPET disabled\n"); - return -ENODEV; - } - - clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); - return 0; + pr_warn("Counter not counting. HPET disabled\n"); + return false; } -static u32 *hpet_boot_cfg; - /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/ int __init hpet_enable(void) { - u32 hpet_period, cfg, id; + u32 hpet_period, cfg, id, irq; + unsigned int i, channels; + struct hpet_channel *hc; u64 freq; - unsigned int i, last; if (!is_hpet_capable()) return 0; @@ -909,40 +823,22 @@ int __init hpet_enable(void) if (!hpet_virt_address) return 0; + /* Validate that the config register is working */ + if (!hpet_cfg_working()) + goto out_nohpet; + + /* Validate that the counter is counting */ + if (!hpet_counting()) + goto out_nohpet; + /* * Read the period and check for a sane value: */ hpet_period = hpet_readl(HPET_PERIOD); - - /* - * AMD SB700 based systems with spread spectrum enabled use a - * SMM based HPET emulation to provide proper frequency - * setting. The SMM code is initialized with the first HPET - * register access and takes some time to complete. During - * this time the config register reads 0xffffffff. We check - * for max. 1000 loops whether the config register reads a non - * 0xffffffff value to make sure that HPET is up and running - * before we go further. A counting loop is safe, as the HPET - * access takes thousands of CPU cycles. On non SB700 based - * machines this check is only done once and has no side - * effects. - */ - for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) { - if (i == 1000) { - printk(KERN_WARNING - "HPET config register value = 0xFFFFFFFF. " - "Disabling HPET\n"); - goto out_nohpet; - } - } - if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; - /* - * The period is a femto seconds value. Convert it to a - * frequency. - */ + /* The period is a femtoseconds value. Convert it to a frequency. */ freq = FSEC_PER_SEC; do_div(freq, hpet_period); hpet_freq = freq; @@ -954,72 +850,90 @@ int __init hpet_enable(void) id = hpet_readl(HPET_ID); hpet_print_config(); - last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + /* The ID field holds the zero based number of the last channel; add 1 to get the count */ + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; -#ifdef CONFIG_HPET_EMULATE_RTC /* * The legacy routing mode needs at least two channels, tick timer * and the rtc emulation channel.
*/ - if (!last) + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2) goto out_nohpet; -#endif + hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL); + if (!hc) { + pr_warn("Disabling HPET.\n"); + goto out_nohpet; + } + hpet_base.channels = hc; + hpet_base.nr_channels = channels; + + /* Read, store and sanitize the global configuration */ cfg = hpet_readl(HPET_CFG); - hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg), - GFP_KERNEL); - if (hpet_boot_cfg) - *hpet_boot_cfg = cfg; - else - pr_warn("HPET initial state will not be saved\n"); + hpet_base.boot_cfg = cfg; cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); if (cfg) - pr_warn("Unrecognized bits %#x set in global cfg\n", cfg); + pr_warn("Global config: Unknown bits %#x\n", cfg); + + /* Read, store and sanitize the per channel configuration */ + for (i = 0; i < channels; i++, hc++) { + hc->num = i; - for (i = 0; i <= last; ++i) { cfg = hpet_readl(HPET_Tn_CFG(i)); - if (hpet_boot_cfg) - hpet_boot_cfg[i + 1] = cfg; + hc->boot_cfg = cfg; + irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; + hc->irq = irq; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(i)); + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE | HPET_TN_FSB | HPET_TN_FSB_CAP); if (cfg) - pr_warn("Unrecognized bits %#x set in cfg#%u\n", - cfg, i); + pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg); } hpet_print_config(); - if (hpet_clocksource_register()) - goto out_nohpet; + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { - hpet_legacy_clockevent_register(); + hpet_legacy_clockevent_register(&hpet_base.channels[0]); + hpet_base.channels[0].mode = HPET_MODE_LEGACY; + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC)) + hpet_base.channels[1].mode = HPET_MODE_LEGACY; return 1; } return 0; out_nohpet: + kfree(hpet_base.channels); + hpet_base.channels = NULL; + hpet_base.nr_channels = 0; hpet_clear_mapping(); hpet_address = 0; return 0; } /* - * Needs to be late, as the reserve_timer code calls kalloc ! + * The late initialization runs after the PCI quirks have been invoked + * which might have detected a system on which the HPET can be enforced. + * + * Also, the MSI machinery is not working yet when the HPET is initialized + * early. * - * Not a problem on i386 as hpet_enable is called from late_time_init, - * but on x86_64 it is necessary ! 
+ * If the HPET is enabled, then: + * + * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y + * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents + * 3) Setup /dev/hpet if CONFIG_HPET=y + * 4) Register hotplug callbacks when clockevents are available */ static __init int hpet_late_init(void) { int ret; - if (boot_hpet_disable) - return -ENODEV; - if (!hpet_address) { if (!force_hpet_address) return -ENODEV; @@ -1031,21 +945,14 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; - if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) - hpet_msi_capability_lookup(2); - else - hpet_msi_capability_lookup(0); - - hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + hpet_select_device_channel(); + hpet_select_clockevents(); + hpet_reserve_platform_timers(); hpet_print_config(); - if (hpet_msi_disable) + if (!hpet_base.nr_clockevents) return 0; - if (boot_cpu_has(X86_FEATURE_ARAT)) - return 0; - - /* This notifier should be called after workqueue is ready */ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", hpet_cpuhp_online, NULL); if (ret) @@ -1064,47 +971,47 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { - if (is_hpet_capable() && hpet_virt_address) { - unsigned int cfg = hpet_readl(HPET_CFG), id, last; - - if (hpet_boot_cfg) - cfg = *hpet_boot_cfg; - else if (hpet_legacy_int_enabled) { - cfg &= ~HPET_CFG_LEGACY; - hpet_legacy_int_enabled = false; - } - cfg &= ~HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); + unsigned int i; + u32 cfg; - if (!hpet_boot_cfg) - return; + if (!is_hpet_capable() || !hpet_virt_address) + return; - id = hpet_readl(HPET_ID); - last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + /* Restore boot configuration with the enable bit cleared */ + cfg = hpet_base.boot_cfg; + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); - for (id = 0; id <= last; ++id) - hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id)); + /* Restore the channel boot configuration */ + for (i = 0; i < hpet_base.nr_channels; i++) + hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i)); - if (*hpet_boot_cfg & HPET_CFG_ENABLE) - hpet_writel(*hpet_boot_cfg, HPET_CFG); - } + /* If the HPET was enabled at boot time, reenable it */ + if (hpet_base.boot_cfg & HPET_CFG_ENABLE) + hpet_writel(hpet_base.boot_cfg, HPET_CFG); } #ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET +/* + * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET * is enabled, we support RTC interrupt functionality in software. + * * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. 
+ * + * 1) Update Interrupt - generate an interrupt, every second, when the + * RTC clock is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2) + * + * (1) and (2) above are implemented using polling at a frequency of 64 Hz: + * DEFAULT_RTC_INT_FREQ. + * + * The exact frequency is a tradeoff between accuracy and interrupt overhead. + * + * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency, + * if it's higher. */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> @@ -1125,7 +1032,7 @@ static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; /* - * Check that the hpet counter c1 is ahead of the c2 + * Check that the HPET counter c1 is ahead of c2 */ static inline int hpet_cnt_ahead(u32 c1, u32 c2) { @@ -1163,8 +1070,8 @@ void hpet_unregister_irq_handler(rtc_irq_handler handler) EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); /* - * Timer 1 for RTC emulation. We use one shot mode, as periodic mode - * is not supported by all HPET implementations for timer 1. + * Channel 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for channel 1. * * hpet_rtc_timer_init() is called when the rtc is initialized. */ @@ -1177,10 +1084,11 @@ int hpet_rtc_timer_init(void) return 0; if (!hpet_default_delta) { + struct clock_event_device *evt = &hpet_base.channels[0].evt; uint64_t clc; - clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; - clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + clc = (uint64_t) evt->mult * NSEC_PER_SEC; + clc >>= evt->shift + DEFAULT_RTC_SHIFT; hpet_default_delta = clc; } @@ -1209,6 +1117,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); static void hpet_disable_rtc_channel(void) { u32 cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T1_CFG); } @@ -1250,8 +1159,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) } EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, - unsigned char sec) +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) { if (!is_hpet_enabled()) return 0; @@ -1271,15 +1179,18 @@ int hpet_set_periodic_freq(unsigned long freq) if (!is_hpet_enabled()) return 0; - if (freq <= DEFAULT_RTC_INT_FREQ) + if (freq <= DEFAULT_RTC_INT_FREQ) { hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; - else { - clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + } else { + struct clock_event_device *evt = &hpet_base.channels[0].evt; + + clc = (uint64_t) evt->mult * NSEC_PER_SEC; do_div(clc, freq); - clc >>= hpet_clockevent.shift; + clc >>= evt->shift; hpet_pie_delta = clc; hpet_pie_limit = 0; } + return 1; } EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); @@ -1317,8 +1228,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", - lost_ints); + pr_warn("Lost %d RTC interrupts\n", lost_ints); } } @@ -1340,8 +1250,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) hpet_prev_update_sec = curr_time.tm_sec; } - if (hpet_rtc_flags & RTC_PIE && - ++hpet_pie_count >= hpet_pie_limit) { + if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) { rtc_int_flag |= RTC_PF; hpet_pie_count = 0; } @@ -1350,7 +1259,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) 
(curr_time.tm_sec == hpet_alarm_time.tm_sec) && (curr_time.tm_min == hpet_alarm_time.tm_min) && (curr_time.tm_hour == hpet_alarm_time.tm_hour)) - rtc_int_flag |= RTC_AF; + rtc_int_flag |= RTC_AF; if (rtc_int_flag) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 0d307a657abb..2b7999a1a50a 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -8,6 +8,7 @@ #include <linux/timex.h> #include <linux/i8253.h> +#include <asm/apic.h> #include <asm/hpet.h> #include <asm/time.h> #include <asm/smp.h> @@ -18,10 +19,32 @@ */ struct clock_event_device *global_clock_event; -void __init setup_pit_timer(void) +/* + * Modern chipsets can disable the PIT clock which makes it unusable. It + * would be possible to enable the clock but the registers are chipset + * specific and not discoverable. Avoid the whack-a-mole game. + * + * These platforms have discoverable TSC/CPU frequencies but this also + * requires knowing the local APIC timer frequency, as it is normally + * calibrated against the PIT interrupt. + */ +static bool __init use_pit(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC)) + return true; + + /* This also returns true when APIC is disabled */ + return apic_needs_pit(); +} + +bool __init pit_timer_init(void) { + if (!use_pit()) + return false; + clockevent_i8253_init(true); global_clock_event = &i8253_clockevent; + return true; } #ifndef CONFIG_X86_64 diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d2482bbbe3d0..87ef69a72c52 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -319,7 +319,8 @@ void __init idt_setup_apic_and_irq_gates(void) #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { set_bit(i, system_vectors); - set_intr_gate(i, spurious_interrupt); + entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); + set_intr_gate(i, entry); } #endif } diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 64b973f0e985..4c407833faca 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -11,10 +11,11 @@ extern struct boot_params boot_params; static enum efi_secureboot_mode get_sb_mode(void) { efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; + efi_char16_t efi_SetupMode_name[] = L"SetupMode"; efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; efi_status_t status; unsigned long size; - u8 secboot; + u8 secboot, setupmode; size = sizeof(secboot); @@ -36,7 +37,14 @@ static enum efi_secureboot_mode get_sb_mode(void) return efi_secureboot_mode_unknown; } - if (secboot == 0) { + size = sizeof(setupmode); + status = efi.get_variable(efi_SetupMode_name, &efi_variable_guid, + NULL, &size, &setupmode); + + if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ + setupmode = 0; + + if (secboot == 0 || setupmode == 1) { pr_info("ima: secureboot mode disabled\n"); return efi_secureboot_mode_disabled; } diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 805b7a341aca..fdb6506ceaaa 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -13,7 +13,22 @@ #include <linux/dmi.h> #include <linux/io.h> -int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +#define IO_DELAY_TYPE_0X80 0 +#define IO_DELAY_TYPE_0XED 1 +#define IO_DELAY_TYPE_UDELAY 2 +#define IO_DELAY_TYPE_NONE 3 + +#if defined(CONFIG_IO_DELAY_0X80) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80 +#elif defined(CONFIG_IO_DELAY_0XED) +#define DEFAULT_IO_DELAY_TYPE
IO_DELAY_TYPE_0XED +#elif defined(CONFIG_IO_DELAY_UDELAY) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY +#elif defined(CONFIG_IO_DELAY_NONE) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE +#endif + +int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE; static int __initdata io_delay_override; @@ -24,13 +39,13 @@ void native_io_delay(void) { switch (io_delay_type) { default: - case CONFIG_IO_DELAY_TYPE_0X80: + case IO_DELAY_TYPE_0X80: asm volatile ("outb %al, $0x80"); break; - case CONFIG_IO_DELAY_TYPE_0XED: + case IO_DELAY_TYPE_0XED: asm volatile ("outb %al, $0xed"); break; - case CONFIG_IO_DELAY_TYPE_UDELAY: + case IO_DELAY_TYPE_UDELAY: /* * 2 usecs is an upper-bound for the outb delay but * note that udelay doesn't have the bus-level @@ -39,7 +54,8 @@ void native_io_delay(void) * are shorter until calibrated): */ udelay(2); - case CONFIG_IO_DELAY_TYPE_NONE: + break; + case IO_DELAY_TYPE_NONE: break; } } @@ -47,9 +63,9 @@ EXPORT_SYMBOL(native_io_delay); static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) { - if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + if (io_delay_type == IO_DELAY_TYPE_0X80) { pr_notice("%s: using 0xed I/O delay port\n", id->ident); - io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + io_delay_type = IO_DELAY_TYPE_0XED; } return 0; @@ -115,13 +131,13 @@ static int __init io_delay_param(char *s) return -EINVAL; if (!strcmp(s, "0x80")) - io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + io_delay_type = IO_DELAY_TYPE_0X80; else if (!strcmp(s, "0xed")) - io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + io_delay_type = IO_DELAY_TYPE_0XED; else if (!strcmp(s, "udelay")) - io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + io_delay_type = IO_DELAY_TYPE_UDELAY; else if (!strcmp(s, "none")) - io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; + io_delay_type = IO_DELAY_TYPE_NONE; else return -EINVAL; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9b68b5b00ac9..4215653f8a8e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -135,7 +135,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_puts(p, " Machine check polls\n"); #endif -#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) @@ -247,7 +247,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) if (!handle_irq(desc, regs)) { ack_APIC_irq(); - if (desc != VECTOR_RETRIGGERED) { + if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) { pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), vector); diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 1b2ee55a2dfb..6857b4577f17 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -45,7 +45,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now) static void __init jailhouse_timer_init(void) { - lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ); + lapic_timer_period = setup_data.apic_khz * (1000 / HZ); } static unsigned long jailhouse_get_tsc(void) @@ -203,7 +203,7 @@ bool jailhouse_paravirt(void) return jailhouse_cpuid_base() != 0; } -static bool jailhouse_x2apic_available(void) +static bool __init jailhouse_x2apic_available(void) { /* * The x2APIC is only available if the root cell enabled it. 
Jailhouse diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e631c358f7f4..044053235302 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -35,41 +35,43 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __ref __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void __jump_label_set_jump_code(struct jump_entry *entry, + enum jump_label_type type, + union jump_code_union *code, + int init) { - union jump_code_union jmp; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect, *code; + const void *expect; int line; - jmp.jump = 0xe9; - jmp.offset = jump_entry_target(entry) - - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + code->jump = 0xe9; + code->offset = jump_entry_target(entry) - + (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - if (type == JUMP_LABEL_JMP) { - if (init) { - expect = default_nop; line = __LINE__; - } else { - expect = ideal_nop; line = __LINE__; - } - - code = &jmp.code; + if (init) { + expect = default_nop; line = __LINE__; + } else if (type == JUMP_LABEL_JMP) { + expect = ideal_nop; line = __LINE__; } else { - if (init) { - expect = default_nop; line = __LINE__; - } else { - expect = &jmp.code; line = __LINE__; - } - - code = ideal_nop; + expect = code->code; line = __LINE__; } if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) bug_at((void *)jump_entry_code(entry), line); + if (type == JUMP_LABEL_NOP) + memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE); +} + +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + union jump_code_union code; + + __jump_label_set_jump_code(entry, type, &code, init); + /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that @@ -82,12 +84,12 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), code, + text_poke_early((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE); return; } - text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE, + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); } @@ -99,6 +101,75 @@ void arch_jump_label_transform(struct jump_entry *entry, mutex_unlock(&text_mutex); } +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + struct text_poke_loc *tp; + void *entry_code; + + if (system_state == SYSTEM_BOOTING) { + /* + * Fallback to the non-batching mode. + */ + arch_jump_label_transform(entry, type); + return true; + } + + /* + * No more space in the vector, tell upper layer to apply + * the queue before continuing. + */ + if (tp_vec_nr == TP_VEC_MAX) + return false; + + tp = &tp_vec[tp_vec_nr]; + + entry_code = (void *)jump_entry_code(entry); + + /* + * The INT3 handler will do a bsearch in the queue, so we need entries + * to be sorted. We can survive an unsorted list by rejecting the entry, + * forcing the generic jump_label code to apply the queue. 
We warn once to draw + * attention to an unsorted entry, which had better not happen: in the + * worst case we simply perform the same way as without batching, with + * some more overhead. + */ + if (tp_vec_nr > 0) { + int prev = tp_vec_nr - 1; + struct text_poke_loc *prev_tp = &tp_vec[prev]; + + if (WARN_ON_ONCE(prev_tp->addr > entry_code)) + return false; + } + + __jump_label_set_jump_code(entry, type, + (union jump_code_union *) &tp->opcode, 0); + + tp->addr = entry_code; + tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; + tp->len = JUMP_LABEL_NOP_SIZE; + + tp_vec_nr++; + + return true; +} + +void arch_jump_label_transform_apply(void) +{ + if (!tp_vec_nr) + return; + + mutex_lock(&text_mutex); + text_poke_bp_batch(tp_vec, tp_vec_nr); + mutex_unlock(&text_mutex); + + tp_vec_nr = 0; +} + static enum { JL_STATE_START, JL_STATE_NO_UPDATE, diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 7670ac2bda3a..edaa30b20841 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -67,33 +67,18 @@ static const struct file_operations fops_setup_data = { .llseek = default_llseek, }; -static int __init +static void __init create_setup_data_node(struct dentry *parent, int no, struct setup_data_node *node) { - struct dentry *d, *type, *data; + struct dentry *d; char buf[16]; sprintf(buf, "%d", no); d = debugfs_create_dir(buf, parent); - if (!d) - return -ENOMEM; - - type = debugfs_create_x32("type", S_IRUGO, d, &node->type); - if (!type) - goto err_dir; - - data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); - if (!data) - goto err_type; - return 0; - -err_type: - debugfs_remove(type); -err_dir: - debugfs_remove(d); - return -ENOMEM; + debugfs_create_x32("type", S_IRUGO, d, &node->type); + debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); } static int __init create_setup_data_nodes(struct dentry *parent) @@ -106,8 +91,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) int no = 0; d = debugfs_create_dir("setup_data", parent); - if (!d) - return -ENOMEM; pa_data = boot_params.hdr.setup_data; @@ -128,19 +111,17 @@ static int __init create_setup_data_nodes(struct dentry *parent) node->paddr = pa_data; node->type = data->type; node->len = data->len; - error = create_setup_data_node(d, no, node); + create_setup_data_node(d, no, node); pa_data = data->next; memunmap(data); - if (error) - goto err_dir; no++; } return 0; err_dir: - debugfs_remove(d); + debugfs_remove_recursive(d); return error; } @@ -151,35 +132,18 @@ static struct debugfs_blob_wrapper boot_params_blob = { static int __init boot_params_kdebugfs_init(void) { - struct dentry *dbp, *version, *data; - int error = -ENOMEM; + struct dentry *dbp; + int error; dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); - if (!dbp) - return -ENOMEM; - - version = debugfs_create_x16("version", S_IRUGO, dbp, - &boot_params.hdr.version); - if (!version) - goto err_dir; - data = debugfs_create_blob("data", S_IRUGO, dbp, - &boot_params_blob); - if (!data) - goto err_version; + debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version); + debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob); error = create_setup_data_nodes(dbp); if (error) - goto err_data; + debugfs_remove_recursive(dbp); - return 0; - -err_data: - debugfs_remove(data); -err_version: - debugfs_remove(version); -err_dir: - debugfs_remove(dbp); return error; } #endif /* CONFIG_DEBUG_BOOT_PARAMS */ @@ -189,8 +153,6 @@ static int __init
arch_kdebugfs_init(void) int error = 0; arch_debugfs_dir = debugfs_create_dir("x86", NULL); - if (!arch_debugfs_dir) - return -ENOMEM; #ifdef CONFIG_DEBUG_BOOT_PARAMS error = boot_params_kdebugfs_init(); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f03237e3f192..5ebcd02cbca7 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -319,6 +319,11 @@ static int bzImage64_probe(const char *buf, unsigned long len) return ret; } + if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) { + pr_err("bzImage cannot handle 5-level paging mode.\n"); + return ret; + } + /* I've got a bzImage */ pr_debug("It's a relocatable bzImage64\n"); ret = 0; @@ -414,7 +419,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, efi_map_offset = params_cmdline_sz; efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); - /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + /* Copy setup header onto bootparams. Documentation/x86/boot.rst */ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; /* Is there a limit on setup header size? */ diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 6690c5652aeb..23297ea64f5f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -118,14 +118,6 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) #ifdef CONFIG_X86_32 switch (regno) { - case GDB_SS: - if (!user_mode(regs)) - *(unsigned long *)mem = __KERNEL_DS; - break; - case GDB_SP: - if (!user_mode(regs)) - *(unsigned long *)mem = kernel_stack_pointer(regs); - break; case GDB_GS: case GDB_FS: *(unsigned long *)mem = 0xFFFF; diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 2b949f4fd4d8..7d3a2e2daf01 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -5,15 +5,10 @@ /* Kprobes and Optprobes common header */ #include <asm/asm.h> - -#ifdef CONFIG_FRAME_POINTER -# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ - " mov %" _ASM_SP ", %" _ASM_BP "\n" -#else -# define SAVE_RBP_STRING " push %" _ASM_BP "\n" -#endif +#include <asm/frame.h> #ifdef CONFIG_X86_64 + #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. */ \ " subq $24, %rsp\n" \ @@ -27,11 +22,13 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - SAVE_RBP_STRING \ + " pushq %rbp\n" \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ - " pushq %r15\n" + " pushq %r15\n" \ + ENCODE_FRAME_POINTER + #define RESTORE_REGS_STRING \ " popq %r15\n" \ " popq %r14\n" \ @@ -51,19 +48,22 @@ /* Skip orig_ax, ip, cs */ \ " addq $24, %rsp\n" #else + #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax and gs. */ \ - " subl $16, %esp\n" \ + " subl $4*4, %esp\n" \ " pushl %fs\n" \ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - SAVE_RBP_STRING \ + " pushl %ebp\n" \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ " pushl %ecx\n" \ - " pushl %ebx\n" + " pushl %ebx\n" \ + ENCODE_FRAME_POINTER + #define RESTORE_REGS_STRING \ " popl %ebx\n" \ " popl %ecx\n" \ @@ -72,8 +72,8 @@ " popl %edi\n" \ " popl %ebp\n" \ " popl %eax\n" \ - /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ - " addl $24, %esp\n" + /* Skip ds, es, fs, gs, orig_ax, ip, and cs. 
*/\ + " addl $7*4, %esp\n" #endif /* Ensure if the instruction can be boostable */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6afd8061dbae..0e0b08008b5a 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -56,7 +56,7 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) +#define stack_addr(regs) ((unsigned long *)regs->sp) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -718,29 +718,27 @@ asm( ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" -#ifdef CONFIG_X86_64 /* We don't bother saving the ss register */ +#ifdef CONFIG_X86_64 " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ - " movq %rax, 152(%rsp)\n" + " movq %rax, 19*8(%rsp)\n" RESTORE_REGS_STRING " popfq\n" #else - " pushf\n" + " pushl %esp\n" + " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" - /* Move flags to cs */ - " movl 56(%esp), %edx\n" - " movl %edx, 52(%esp)\n" - /* Replace saved flags with true return address. */ - " movl %eax, 56(%esp)\n" + /* Replace saved sp with true return address. */ + " movl %eax, 15*4(%esp)\n" RESTORE_REGS_STRING - " popf\n" + " popfl\n" #endif " ret\n" ".size kretprobe_trampoline, .-kretprobe_trampoline\n" @@ -781,16 +779,13 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ -#ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; - /* On x86-64, we use pt_regs->sp for return address holder. */ - frame_pointer = ®s->sp; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); +#ifdef CONFIG_X86_32 + regs->cs |= get_kernel_rpl(); regs->gs = 0; - /* On x86-32, we use pt_regs->flags for return address holder. */ - frame_pointer = ®s->flags; #endif + /* We use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -813,7 +808,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) continue; /* * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be poped + * order (same as return order) so that it can be popped * correctly. 
However, if we find it is pushed it incorrect * order, this means we find a function which should not be * probed, because the wrong order entry is pushed on the diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c361a24c6df..9d4aedece363 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -102,14 +102,15 @@ asm ( "optprobe_template_call:\n" ASM_NOP5 /* Move flags to rsp */ - " movq 144(%rsp), %rdx\n" - " movq %rdx, 152(%rsp)\n" + " movq 18*8(%rsp), %rdx\n" + " movq %rdx, 19*8(%rsp)\n" RESTORE_REGS_STRING /* Skip flags entry */ " addq $8, %rsp\n" " popfq\n" #else /* CONFIG_X86_32 */ - " pushf\n" + " pushl %esp\n" + " pushfl\n" SAVE_REGS_STRING " movl %esp, %edx\n" ".global optprobe_template_val\n" @@ -118,9 +119,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 + /* Move flags into esp */ + " movl 14*4(%esp), %edx\n" + " movl %edx, 15*4(%esp)\n" RESTORE_REGS_STRING - " addl $4, %esp\n" /* skip cs */ - " popf\n" + /* Skip flags entry */ + " addl $4, %esp\n" + " popfl\n" #endif ".global optprobe_template_end\n" "optprobe_template_end:\n" @@ -152,10 +157,9 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Save skipped registers */ -#ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); +#ifdef CONFIG_X86_32 + regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; @@ -418,7 +422,7 @@ err: void arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - u8 insn_buf[RELATIVEJUMP_SIZE]; + u8 insn_buff[RELATIVEJUMP_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { s32 rel = (s32)((long)op->optinsn.insn - @@ -430,10 +434,10 @@ void arch_optimize_kprobes(struct list_head *oplist) memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, RELATIVE_ADDR_SIZE); - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; + insn_buff[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, op->optinsn.insn); list_del_init(&op->list); @@ -443,12 +447,12 @@ void arch_optimize_kprobes(struct list_head *oplist) /* Replace a relative jump with a breakpoint (int3). */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - u8 insn_buf[RELATIVEJUMP_SIZE]; + u8 insn_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ - insn_buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + insn_buff[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, op->optinsn.insn); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5169b8cc35bb..82caf01b63dd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -527,6 +527,21 @@ static void kvm_setup_pv_ipi(void) pr_info("KVM setup pv IPIs\n"); } +static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) +{ + int cpu; + + native_send_call_func_ipi(mask); + + /* Make sure other vCPUs get a chance to run if they need to. 
*/ + for_each_cpu(cpu, mask) { + if (vcpu_is_preempted(cpu)) { + kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); + break; + } + } +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -638,6 +653,12 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; + pr_info("KVM setup pv sched yield\n"); + } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index d7be2376ac0b..5dcd438ad8f2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -16,6 +16,7 @@ #include <linux/io.h> #include <linux/suspend.h> #include <linux/vmalloc.h> +#include <linux/efi.h> #include <asm/init.h> #include <asm/pgtable.h> @@ -27,6 +28,55 @@ #include <asm/setup.h> #include <asm/set_memory.h> +#ifdef CONFIG_ACPI +/* + * Used while adding mapping for ACPI tables. + * Can be reused when other iomem regions need be mapped + */ +struct init_pgtable_data { + struct x86_mapping_info *info; + pgd_t *level4p; +}; + +static int mem_region_callback(struct resource *res, void *arg) +{ + struct init_pgtable_data *data = arg; + unsigned long mstart, mend; + + mstart = res->start; + mend = mstart + resource_size(res) - 1; + + return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend); +} + +static int +map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) +{ + struct init_pgtable_data data; + unsigned long flags; + int ret; + + data.info = info; + data.level4p = level4p; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + /* ACPI tables could be located in ACPI Non-volatile Storage region */ + ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + return 0; +} +#else +static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } +#endif + #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, @@ -34,6 +84,31 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { }; #endif +static int +map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) +{ +#ifdef CONFIG_EFI + unsigned long mstart, mend; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + mstart = (boot_params.efi_info.efi_systab | + ((u64)boot_params.efi_info.efi_systab_hi<<32)); + + if (efi_enabled(EFI_64BIT)) + mend = mstart + sizeof(efi_system_table_64_t); + else + mend = mstart + sizeof(efi_system_table_32_t); + + if (!mstart) + return 0; + + return kernel_ident_mapping_init(info, level4p, mstart, mend); +#endif + return 0; +} + static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.p4d); @@ -48,12 +123,13 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + pgprot_t prot = 
PAGE_KERNEL_EXEC_NOENC; + unsigned long vaddr, paddr; + int result = -ENOMEM; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr, paddr; - int result = -ENOMEM; vaddr = (unsigned long)relocate_kernel; paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); @@ -90,7 +166,11 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); + + if (sev_active()) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); return 0; err: return result; @@ -127,6 +207,11 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); + if (sev_active()) { + info.page_flag |= _PAGE_ENC; + info.kernpg_flag |= _PAGE_ENC; + } + if (direct_gbpages) info.direct_gbpages = true; @@ -157,6 +242,18 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return result; } + /* + * Prepare EFI systab and ACPI tables for kexec kernel since they are + * not covered by pfn_mapped. + */ + result = map_efi_systab(&info, level4p); + if (result) + return result; + + result = map_acpi_tables(&info, level4p); + if (result) + return result; + return init_transition_pgtable(image, level4p); } @@ -557,8 +654,20 @@ void arch_kexec_unprotect_crashkres(void) kexec_mark_crashkres(false); } +/* + * During a traditional boot under SME, SME will encrypt the kernel, + * so the SME kexec kernel also needs to be un-encrypted in order to + * replicate a normal SME boot. + * + * During a traditional boot under SEV, the kernel has already been + * loaded encrypted, so the SEV kexec kernel needs to be encrypted in + * order to replicate a normal SEV boot. + */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { + if (sev_active()) + return 0; + /* * If SME is active we need to be sure that kexec pages are * not encrypted because when we boot to the new kernel the @@ -569,6 +678,9 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { + if (sev_active()) + return; + /* * If SME is active we need to reset the pages back to being * an encrypted mapping before freeing them. 
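map_acpi_tables() above reuses the kernel's walk-the-resource-tree idiom: hand walk_iomem_res_desc() a callback plus a cookie, and let the callback identity-map each matching region. A self-contained userspace model of that callback pattern (struct region, walk_regions() and the addresses are invented for illustration; the real API is walk_iomem_res_desc()):

#include <stdio.h>

struct region { unsigned long start, size; };

/* Callback applied to each region; a non-zero return aborts the walk,
 * matching the convention mem_region_callback() follows above. */
static int map_one(const struct region *r, void *arg)
{
	(void)arg;
	printf("ident-map %#lx-%#lx\n", r->start, r->start + r->size - 1);
	return 0;
}

static int walk_regions(const struct region *res, int n,
			int (*fn)(const struct region *, void *), void *arg)
{
	for (int i = 0; i < n; i++) {
		int ret = fn(&res[i], arg);
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct region acpi[] = { { 0x7f000000, 0x10000 },
				 { 0x7f100000, 0x04000 } };
	return walk_regions(acpi, 2, map_one, NULL);
}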
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 06f6bb48d018..98039d7fb998 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -58,24 +58,24 @@ struct branch { u32 delta; } __attribute__((packed)); -static unsigned paravirt_patch_call(void *insnbuf, const void *target, +static unsigned paravirt_patch_call(void *insn_buff, const void *target, unsigned long addr, unsigned len) { - struct branch *b = insnbuf; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ + const int call_len = 5; + struct branch *b = insn_buff; + unsigned long delta = (unsigned long)target - (addr+call_len); + + if (len < call_len) { + pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr); + /* Kernel might not be viable if patching fails, bail out: */ + BUG_ON(1); } b->opcode = 0xe8; /* call */ b->delta = delta; - BUILD_BUG_ON(sizeof(*b) != 5); + BUILD_BUG_ON(sizeof(*b) != call_len); - return 5; + return call_len; } #ifdef CONFIG_PARAVIRT_XXL @@ -85,10 +85,10 @@ u64 notrace _paravirt_ident_64(u64 x) return x; } -static unsigned paravirt_patch_jmp(void *insnbuf, const void *target, +static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, unsigned long addr, unsigned len) { - struct branch *b = insnbuf; + struct branch *b = insn_buff; unsigned long delta = (unsigned long)target - (addr+5); if (len < 5) { @@ -113,7 +113,7 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insnbuf, +unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len) { /* @@ -125,36 +125,36 @@ unsigned paravirt_patch_default(u8 type, void *insnbuf, if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); + ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == _paravirt_nop) ret = 0; #ifdef CONFIG_PARAVIRT_XXL /* identity functions just return their single argument */ else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insnbuf, len); + ret = paravirt_patch_ident_64(insn_buff, len); else if (type == PARAVIRT_PATCH(cpu.iret) || type == PARAVIRT_PATCH(cpu.usergs_sysret64)) /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); + ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); #endif else /* Otherwise call the function. 
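paravirt_patch_call() above now BUGs instead of quietly skipping an undersized site; what it emits on success is an ordinary 5-byte near call: opcode 0xe8 followed by rel32 = target - (site + 5). A hedged userspace sketch of that encoding (addresses are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Emit "call rel32" into buf, as if buf lived at address site. */
static void emit_call(uint8_t buf[5], uint64_t site, uint64_t target)
{
	int32_t delta = (int32_t)(target - (site + 5));

	buf[0] = 0xe8;			/* call rel32 */
	memcpy(&buf[1], &delta, 4);	/* x86 stores it little-endian */
}

int main(void)
{
	uint8_t buf[5];

	emit_call(buf, 0x1000, 0x2000);
	for (int i = 0; i < 5; i++)
		printf("%02x ", buf[i]);	/* e8 fb 0f 00 00 */
	printf("\n");
	return 0;
}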
*/ - ret = paravirt_patch_call(insnbuf, opfunc, addr, len); + ret = paravirt_patch_call(insn_buff, opfunc, addr, len); return ret; } -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, +unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end) { unsigned insn_len = end - start; - if (insn_len > len || start == NULL) - insn_len = len; - else - memcpy(insnbuf, start, insn_len); + /* Alternative instruction is too large for the patch site and we cannot continue: */ + BUG_ON(insn_len > len || start == NULL); + + memcpy(insn_buff, start, insn_len); return insn_len; } diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c new file mode 100644 index 000000000000..3eff63c090d2 --- /dev/null +++ b/arch/x86/kernel/paravirt_patch.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/stringify.h> + +#include <asm/paravirt.h> +#include <asm/asm-offsets.h> + +#define PSTART(d, m) \ + patch_data_##d.m + +#define PEND(d, m) \ + (PSTART(d, m) + sizeof(patch_data_##d.m)) + +#define PATCH(d, m, insn_buff, len) \ + paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) + +#define PATCH_CASE(ops, m, data, insn_buff, len) \ + case PARAVIRT_PATCH(ops.m): \ + return PATCH(data, ops##_##m, insn_buff, len) + +#ifdef CONFIG_PARAVIRT_XXL +struct patch_xxl { + const unsigned char irq_irq_disable[1]; + const unsigned char irq_irq_enable[1]; + const unsigned char irq_save_fl[2]; + const unsigned char mmu_read_cr2[3]; + const unsigned char mmu_read_cr3[3]; + const unsigned char mmu_write_cr3[3]; + const unsigned char irq_restore_fl[2]; +# ifdef CONFIG_X86_64 + const unsigned char cpu_wbinvd[2]; + const unsigned char cpu_usergs_sysret64[6]; + const unsigned char cpu_swapgs[3]; + const unsigned char mov64[3]; +# else + const unsigned char cpu_iret[1]; +# endif +}; + +static const struct patch_xxl patch_data_xxl = { + .irq_irq_disable = { 0xfa }, // cli + .irq_irq_enable = { 0xfb }, // sti + .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax + .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax + .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax +# ifdef CONFIG_X86_64 + .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 + .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq + .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd + .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, + 0x48, 0x0f, 0x07 }, // swapgs; sysretq + .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs + .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax +# else + .mmu_write_cr3 = { 0x0f, 0x22, 0xd8 }, // mov %eax, %cr3 + .irq_restore_fl = { 0x50, 0x9d }, // push %eax; popf + .cpu_iret = { 0xcf }, // iret +# endif +}; + +unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) +{ +#ifdef CONFIG_X86_64 + return PATCH(xxl, mov64, insn_buff, len); +#endif + return 0; +} +# endif /* CONFIG_PARAVIRT_XXL */ + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +struct patch_lock { + unsigned char queued_spin_unlock[3]; + unsigned char vcpu_is_preempted[2]; +}; + +static const struct patch_lock patch_data_lock = { + .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax + +# ifdef CONFIG_X86_64 + .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) +# else + .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) +# endif +}; +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) +{ + switch (type) { + +#ifdef CONFIG_PARAVIRT_XXL + PATCH_CASE(irq, restore_fl, xxl, 
insn_buff, len); + PATCH_CASE(irq, save_fl, xxl, insn_buff, len); + PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); + PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); + + PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); + PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); + PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); + +# ifdef CONFIG_X86_64 + PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); + PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); + PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); +# else + PATCH_CASE(cpu, iret, xxl, insn_buff, len); +# endif +#endif + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + case PARAVIRT_PATCH(lock.queued_spin_unlock): + if (pv_is_native_spin_unlock()) + return PATCH(lock, queued_spin_unlock, insn_buff, len); + break; + + case PARAVIRT_PATCH(lock.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) + return PATCH(lock, vcpu_is_preempted, insn_buff, len); + break; +#endif + default: + break; + } + + return paravirt_patch_default(type, insn_buff, addr, len); +} diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c deleted file mode 100644 index de138d3912e4..000000000000 --- a/arch/x86/kernel/paravirt_patch_32.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <asm/paravirt.h> - -#ifdef CONFIG_PARAVIRT_XXL -DEF_NATIVE(irq, irq_disable, "cli"); -DEF_NATIVE(irq, irq_enable, "sti"); -DEF_NATIVE(irq, restore_fl, "push %eax; popf"); -DEF_NATIVE(irq, save_fl, "pushf; pop %eax"); -DEF_NATIVE(cpu, iret, "iret"); -DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax"); - -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) -{ - /* arg in %edx:%eax, return in %edx:%eax */ - return 0; -} -#endif - -#if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)"); -DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); -#endif - -extern bool pv_is_native_spin_unlock(void); -extern bool pv_is_native_vcpu_is_preempted(void); - -unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) -{ -#define PATCH_SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x) - - switch (type) { -#ifdef CONFIG_PARAVIRT_XXL - PATCH_SITE(irq, irq_disable); - PATCH_SITE(irq, irq_enable); - PATCH_SITE(irq, restore_fl); - PATCH_SITE(irq, save_fl); - PATCH_SITE(cpu, iret); - PATCH_SITE(mmu, read_cr2); - PATCH_SITE(mmu, read_cr3); - PATCH_SITE(mmu, write_cr3); -#endif -#if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return paravirt_patch_insns(ibuf, len, - start_lock_queued_spin_unlock, - end_lock_queued_spin_unlock); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return paravirt_patch_insns(ibuf, len, - start_lock_vcpu_is_preempted, - end_lock_vcpu_is_preempted); - break; -#endif - - default: - break; - } -#undef PATCH_SITE - return paravirt_patch_default(type, ibuf, addr, len); -} diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c deleted file mode 100644 index 9d9e04b31077..000000000000 --- a/arch/x86/kernel/paravirt_patch_64.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> -#include <linux/stringify.h> - -#ifdef CONFIG_PARAVIRT_XXL -DEF_NATIVE(irq, irq_disable, "cli"); -DEF_NATIVE(irq, 
irq_enable, "sti"); -DEF_NATIVE(irq, restore_fl, "pushq %rdi; popfq"); -DEF_NATIVE(irq, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(mmu, read_cr2, "movq %cr2, %rax"); -DEF_NATIVE(mmu, read_cr3, "movq %cr3, %rax"); -DEF_NATIVE(mmu, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(cpu, wbinvd, "wbinvd"); - -DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq"); -DEF_NATIVE(cpu, swapgs, "swapgs"); -DEF_NATIVE(, mov64, "mov %rdi, %rax"); - -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) -{ - return paravirt_patch_insns(insnbuf, len, - start__mov64, end__mov64); -} -#endif - -#if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)"); -DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); -#endif - -extern bool pv_is_native_spin_unlock(void); -extern bool pv_is_native_vcpu_is_preempted(void); - -unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) -{ -#define PATCH_SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x) - - switch (type) { -#ifdef CONFIG_PARAVIRT_XXL - PATCH_SITE(irq, restore_fl); - PATCH_SITE(irq, save_fl); - PATCH_SITE(irq, irq_enable); - PATCH_SITE(irq, irq_disable); - PATCH_SITE(cpu, usergs_sysret64); - PATCH_SITE(cpu, swapgs); - PATCH_SITE(cpu, wbinvd); - PATCH_SITE(mmu, read_cr2); - PATCH_SITE(mmu, read_cr3); - PATCH_SITE(mmu, write_cr3); -#endif -#if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return paravirt_patch_insns(ibuf, len, - start_lock_queued_spin_unlock, - end_lock_queued_spin_unlock); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return paravirt_patch_insns(ibuf, len, - start_lock_vcpu_is_preempted, - end_lock_vcpu_is_preempted); - break; -#endif - - default: - break; - } -#undef PATCH_SITE - return paravirt_patch_default(type, ibuf, addr, len); -} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dcd272dbd0a9..f62b498b18fb 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -70,7 +70,7 @@ void __init pci_iommu_alloc(void) } /* - * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel + * See <Documentation/x86/x86_64/boot-options.rst> for the iommu kernel * parameter documentation. 
*/ static __init int iommu_setup(char *p) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2399e910d109..b8ceec4974fe 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -62,27 +62,21 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long sp; - unsigned short ss, gs; + unsigned short gs; - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss; + if (user_mode(regs)) gs = get_user_gs(regs); - } else { - sp = kernel_stack_pointer(regs); - savesegment(ss, ss); + else savesegment(gs, gs); - } show_ip(regs, KERN_DEFAULT); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, sp); + regs->si, regs->di, regs->bp, regs->sp); printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); if (mode != SHOW_REGS_ALL) return; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a166c960bc9e..71691a8310e7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -25,6 +25,7 @@ #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -154,35 +155,6 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * Now, if the stack is empty, '®s->sp' is out of range. In this - * case we try to take the previous stack. To always return a non-null - * stack pointer we fall back to regs as stack if no previous stack - * exists. - * - * This is valid only for kernel mode traps. - */ -unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ - unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); - unsigned long sp = (unsigned long)®s->sp; - u32 *prev_esp; - - if (context == (sp & ~(THREAD_SIZE - 1))) - return sp; - - prev_esp = (u32 *)(context); - if (*prev_esp) - return (unsigned long)*prev_esp; - - return (unsigned long)regs; -} -EXPORT_SYMBOL_GPL(kernel_stack_pointer); - static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); @@ -397,22 +369,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. 
- */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } @@ -645,7 +607,8 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp = thread->ptrace_bps[n]; + int index = array_index_nospec(n, HBP_NUM); + struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; @@ -747,9 +710,6 @@ static int ioperm_get(struct task_struct *target, void ptrace_disable(struct task_struct *child) { user_disable_single_step(child); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -#endif } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -1361,18 +1321,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { + struct task_struct *tsk = current; + tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, si_code, - user_mode(regs) ? (void __user *)regs->ip : NULL, tsk); + user_mode(regs) ? (void __user *)regs->ip : NULL); } void user_single_step_report(struct pt_regs *regs) { - send_sigtrap(current, regs, 0, TRAP_BRKPT); + send_sigtrap(regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 0ff3e294d0e5..10125358b9c4 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -3,6 +3,7 @@ */ +#include <linux/clocksource.h> #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/notifier.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08a5f4a131f5..bbe35bf879f5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -453,15 +453,24 @@ static void __init memblock_x86_reserve_range_setup_data(void) #define CRASH_ALIGN SZ_16M /* - * Keep the crash kernel below this limit. On 32 bits earlier kernels - * would limit the kernel to the low 512 MiB due to mapping restrictions. + * Keep the crash kernel below this limit. + * + * On 32-bit, earlier kernels would limit the kernel to the low 512 MiB + * due to mapping restrictions. + * + * On 64-bit, the kdump kernel needs to be restricted to be under 64TB, + * which is the upper limit of system RAM in 4-level paging mode. Since + * the kdump jump could be from 5-level to 4-level paging, the jump will + * fail if the kernel is put above 64TB, and there's no way to detect the + * paging mode of the kernel which will be loaded for dumping during the + * 1st kernel bootup. */ #ifdef CONFIG_X86_32 # define CRASH_ADDR_LOW_MAX SZ_512M # define CRASH_ADDR_HIGH_MAX SZ_512M #else # define CRASH_ADDR_LOW_MAX SZ_4G -# define CRASH_ADDR_HIGH_MAX MAXMEM +# define CRASH_ADDR_HIGH_MAX SZ_64T #endif static int __init reserve_crashkernel_low(void) @@ -827,8 +836,14 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. 
+ */ memblock_reserve(__pa_symbol(_text), - (unsigned long)__bss_stop - (unsigned long)_text); + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); /* * Make sure page 0 is always reserved because on systems with diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 364813cea647..8eb7193e158d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -391,7 +391,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(&frame->uc, &frame->puc); /* Create the ucontext. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (static_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); @@ -857,7 +857,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) pr_cont("\n"); } - force_sig(SIGSEGV, me); + force_sig(SIGSEGV); } #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 4693e2f3a03e..96421f97e75c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -144,7 +144,7 @@ void native_send_call_func_ipi(const struct cpumask *mask) } cpumask_copy(allbutself, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), allbutself); + __cpumask_clear_cpu(smp_processor_id(), allbutself); if (cpumask_equal(mask, allbutself) && cpumask_equal(cpu_online_mask, cpu_callout_mask)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 362dd8953f48..259d1d2be076 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -89,6 +89,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +/* representing HT, core, and die siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); +EXPORT_PER_CPU_SYMBOL(cpu_die_map); + DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ @@ -99,6 +103,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; +static unsigned int logical_die __read_mostly; /* Maximum number of SMT threads on any online core */ int __read_mostly __max_smt_threads = 1; @@ -210,17 +215,11 @@ static void notrace start_secondary(void *unused) * before cpu_init(), SMP booting is too fragile that we want to * limit the things done here to the most necessary things. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - __write_cr4(__read_cr4() | X86_CR4_PCIDE); + cr4_init(); #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); - /* - * Initialize the CR4 shadow before doing anything that could - * try to read it. 
- */ - cr4_init_shadow(); __flush_tlb_all(); #endif load_current_idt(); @@ -300,6 +299,26 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) return -1; } EXPORT_SYMBOL(topology_phys_to_logical_pkg); +/** + * topology_phys_to_logical_die - Map a physical die id to logical + * + * Returns logical die id or -1 if not found + */ +int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) +{ + int cpu; + int proc_id = cpu_data(cur_cpu).phys_proc_id; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->cpu_die_id == die_id && + c->phys_proc_id == proc_id) + return c->logical_die_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_die); /** * topology_update_package_map - Update the physical to logical package map @@ -324,6 +343,29 @@ found: cpu_data(cpu).logical_proc_id = new; return 0; } +/** + * topology_update_die_map - Update the physical to logical die map + * @die: The die id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_die_map(unsigned int die, unsigned int cpu) +{ + int new; + + /* Already available somewhere? */ + new = topology_phys_to_logical_die(die, cpu); + if (new >= 0) + goto found; + + new = logical_die++; + if (new != die) { + pr_info("CPU %u Converting physical %u to logical die %u\n", + cpu, die, new); + } +found: + cpu_data(cpu).logical_die_id = new; + return 0; +} void __init smp_store_boot_cpu_info(void) { @@ -333,6 +375,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; topology_update_package_map(c->phys_proc_id, id); + topology_update_die_map(c->cpu_die_id, id); c->initialized = true; } @@ -387,6 +430,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { if (c->cpu_core_id == o->cpu_core_id) return topology_sane(c, o, "smt"); @@ -398,6 +442,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && c->cpu_core_id == o->cpu_core_id) { return topology_sane(c, o, "smt"); } @@ -460,6 +505,15 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if ((c->phys_proc_id == o->phys_proc_id) && + (c->cpu_die_id == o->cpu_die_id)) + return true; + return false; +} + + #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { @@ -522,6 +576,7 @@ void set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); + cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; return; } @@ -570,6 +625,9 @@ void set_cpu_sibling_map(int cpu) } if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; + + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } threads = cpumask_weight(topology_sibling_cpumask(cpu)); @@ -1174,6 +1232,7 @@ static __init void disable_smp(void) physid_set_mask_of_physid(0, &phys_cpu_present_map); cpumask_set_cpu(0, topology_sibling_cpumask(0)); cpumask_set_cpu(0, topology_core_cpumask(0)); + cpumask_set_cpu(0, topology_die_cpumask(0)); } /* @@ -1269,6 +1328,7 @@ 
void __init native_smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } @@ -1489,6 +1549,8 @@ static void remove_siblinginfo(int cpu) cpu_data(sibling).booted_cores--; } + for_each_cpu(sibling, topology_die_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); for_each_cpu(sibling, topology_sibling_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) @@ -1496,6 +1558,7 @@ static void remove_siblinginfo(int cpu) cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); + cpumask_clear(topology_die_cpumask(cpu)); c->cpu_core_id = 0; c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2abf27d7df6b..4f36d3241faf 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -129,11 +129,9 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, break; if ((unsigned long)fp < regs->sp) break; - if (frame.ret_addr) { - if (!consume_entry(cookie, frame.ret_addr, false)) - return; - } - if (fp == frame.next_fp) + if (!frame.ret_addr) + break; + if (!consume_entry(cookie, frame.ret_addr, false)) break; fp = frame.next_fp; } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 0e14f6c0d35e..7ce29cee9f9e 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -37,8 +37,7 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = - (unsigned long *)kernel_stack_pointer(regs); + unsigned long *sp = (unsigned long *)regs->sp; /* * Return address is either directly at stack pointer * or above a saved flags. 
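The cpu_die_map plumbing above hinges on match_die(): two CPUs are die siblings iff both the package id and the die id agree, and link_mask() then sets the corresponding mask bits. A toy userspace rendering of that grouping (the CPU table and ids are fabricated):

#include <stdio.h>

struct cpu { int pkg, die; };

/* Mirrors match_die(): same package and same die id. */
static int same_die(const struct cpu *a, const struct cpu *b)
{
	return a->pkg == b->pkg && a->die == b->die;
}

int main(void)
{
	struct cpu cpus[] = { {0, 0}, {0, 0}, {0, 1}, {1, 0} };
	int n = sizeof(cpus) / sizeof(cpus[0]);

	for (int i = 0; i < n; i++) {
		unsigned int mask = 0;

		for (int j = 0; j < n; j++)
			if (same_die(&cpus[i], &cpus[j]))
				mask |= 1u << j;	/* like link_mask() */
		printf("cpu%d die siblings: %#x\n", i, mask);
	}
	return 0;
}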
Eflags has bits 22-31 zero, @@ -82,8 +81,11 @@ static void __init setup_default_timer_irq(void) /* Default timer init function */ void __init hpet_time_init(void) { - if (!hpet_enable()) - setup_pit_timer(); + if (!hpet_enable()) { + if (!pit_timer_init()) + return; + } + setup_default_timer_irq(); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index a5b802a12212..71d3fef1edc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,6 +5,7 @@ #include <linux/user.h> #include <linux/regset.h> #include <linux/syscalls.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/desc.h> @@ -220,6 +221,7 @@ int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *u_info) { struct user_desc info; + int index; if (idx == -1 && get_user(idx, &u_info->entry_number)) return -EFAULT; @@ -227,8 +229,11 @@ int do_get_thread_area(struct task_struct *p, int idx, if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; - fill_user_desc(&info, idx, - &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + index = idx - GDT_ENTRY_TLS_MIN; + index = array_index_nospec(index, + GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1); + + fill_user_desc(&info, idx, &p->thread.tls_array[index]); if (copy_to_user(u_info, &info, sizeof(info))) return -EFAULT; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8b6d03e55d2f..87095a477154 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -254,9 +254,9 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) - force_sig(signr, tsk); + force_sig(signr); else - force_sig_fault(signr, sicode, addr, tsk); + force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); @@ -566,7 +566,7 @@ do_general_protection(struct pt_regs *regs, long error_code) show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV, tsk); + force_sig(SIGSEGV); } NOKPROBE_SYMBOL(do_general_protection); @@ -805,7 +805,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) } si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(tsk, regs, error_code, si_code); + send_sigtrap(regs, error_code, si_code); cond_local_irq_disable(regs); debug_stack_usage_dec(); @@ -856,7 +856,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; force_sig_fault(SIGFPE, si_code, - (void __user *)uprobe_get_trap_addr(regs), task); + (void __user *)uprobe_get_trap_addr(regs)); } dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0b29e58f288e..57d87f79558f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -59,7 +59,7 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -76,7 +76,7 @@ void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -void __always_inline cyc2ns_read_end(void) +__always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -632,31 +632,38 @@ unsigned long native_calibrate_tsc(void) crystal_khz = ecx_hz / 1000; - if (crystal_khz == 0) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case 
INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - crystal_khz = 24000; /* 24.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT_X: - crystal_khz = 25000; /* 25.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT: - crystal_khz = 19200; /* 19.2 MHz */ - break; - } - } + /* + * Denverton SoCs don't report crystal clock, and also don't support + * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal + * clock. + */ + if (crystal_khz == 0 && + boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X) + crystal_khz = 25000; - if (crystal_khz == 0) - return 0; /* - * TSC frequency determined by CPUID is a "hardware reported" + * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + if (crystal_khz != 0) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Some Intel SoCs like Skylake and Kabylake don't report the crystal + * clock, but we can easily calculate it to a high degree of accuracy + * by considering the crystal ratio and the CPU speed. + */ + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { + unsigned int eax_base_mhz, ebx, ecx, edx; + + cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); + crystal_khz = eax_base_mhz * 1000 * + eax_denominator / ebx_numerator; + } + + if (crystal_khz == 0) + return 0; /* * For Atom SoCs TSC is the only reliable clocksource. @@ -665,6 +672,16 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * The local APIC appears to be fed by the core crystal clock + * (which sounds entirely sensible). We can set the global + * lapic_timer_period here to avoid having to calibrate the APIC + * timer later. + */ + lapic_timer_period = crystal_khz * 1000 / HZ; +#endif + return crystal_khz * ebx_numerator / eax_denominator; } diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 3d0e9aeea7c8..067858fe4db8 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -71,7 +71,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { /* * MSR-based CPU/TSC frequency discovery for certain CPUs. * - * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy + * Set global "lapic_timer_period" to bus_clock_cycles/jiffy * Return processor base frequency in KHz, or 0 on failure. 
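The calibration path above derives everything from two CPUID leaves: 0x15 gives the TSC/crystal ratio, and 0x16 gives the base frequency when the crystal itself is not reported. Worked through with invented but plausible Skylake-like numbers (ratio 88/2, 1056 MHz base, HZ=250):

#include <stdio.h>

int main(void)
{
	/* All inputs are made up for illustration. */
	unsigned int eax_denominator = 2, ebx_numerator = 88; /* CPUID.15H */
	unsigned int eax_base_mhz = 1056;                     /* CPUID.16H */
	unsigned int crystal_khz = 0;	/* CPU did not report the crystal */
	const unsigned int hz = 250;	/* jiffies per second */

	if (crystal_khz == 0)	/* back-compute the crystal, as above */
		crystal_khz = eax_base_mhz * 1000 *
			      eax_denominator / ebx_numerator;

	printf("crystal = %u kHz\n", crystal_khz);		/* 24000 */
	printf("tsc     = %u kHz\n",
	       crystal_khz * ebx_numerator / eax_denominator);	/* 1056000 */
	printf("lapic_timer_period = %u\n",
	       crystal_khz * 1000 / hz);			/* 96000 */
	return 0;
}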
*/ unsigned long cpu_khz_from_msr(void) @@ -104,7 +104,7 @@ unsigned long cpu_khz_from_msr(void) res = freq * ratio; #ifdef CONFIG_X86_LOCAL_APIC - lapic_timer_frequency = (freq * 1000) / HZ; + lapic_timer_period = (freq * 1000) / HZ; #endif /* diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f8f3cfda01ae..5b345add550f 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -277,7 +277,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, tsk); + force_sig_fault(SIGSEGV, SEGV_MAPERR, addr); if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) return; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 6106760de716..a224b5ab103f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -70,15 +70,6 @@ static void unwind_dump(struct unwind_state *state) } } -static size_t regs_size(struct pt_regs *regs) -{ - /* x86_32 regs from kernel mode are two words shorter: */ - if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) - return sizeof(*regs) - 2*sizeof(long); - - return sizeof(*regs); -} - static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; @@ -198,12 +189,6 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif -#ifdef CONFIG_X86_32 -#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) -#else -#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) -#endif - static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -214,7 +199,7 @@ static bool update_stack_state(struct unwind_state *state, size_t len; if (state->regs) - prev_frame_end = (void *)state->regs + regs_size(state->regs); + prev_frame_end = (void *)state->regs + sizeof(*state->regs); else prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; @@ -222,7 +207,7 @@ static bool update_stack_state(struct unwind_state *state, regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = KERNEL_REGS_SIZE; + len = sizeof(*regs); state->got_irq = true; } else { frame = next_bp; @@ -246,14 +231,6 @@ static bool update_stack_state(struct unwind_state *state, frame < prev_frame_end) return false; - /* - * On 32-bit with user mode regs, make sure the last two regs are safe - * to access: - */ - if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && - !on_stack(info, frame, len + 2*sizeof(long))) - return false; - /* Move state to the next frame: */ if (regs) { state->regs = regs; @@ -412,10 +389,9 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, * Pretend that the frame is complete and that BP points to it, but save * the real BP so that we can use it when looking for the next frame. 
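With the 32-bit kernel-regs special case gone, frame-pointer unwinding is again a uniform two-word walk: each frame stores the previous bp and a return address, and the walk stops when the chain ends or leaves the stack. A toy userspace walk over hand-linked frames (all addresses fake):

#include <stdint.h>
#include <stdio.h>

/* Two words per frame, matching what FRAME_HEADER_SIZE covers. */
struct frame { struct frame *next_bp; uintptr_t ret_addr; };

int main(void)
{
	struct frame f3 = { NULL, 0x81000300 };	/* outermost frame */
	struct frame f2 = { &f3,  0x81000200 };
	struct frame f1 = { &f2,  0x81000100 };	/* innermost frame */

	for (struct frame *fp = &f1; fp; fp = fp->next_bp)
		printf("ret = %#lx\n", (unsigned long)fp->ret_addr);
	return 0;
}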
*/ - if (regs && regs->ip == 0 && - (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) { state->next_bp = bp; - bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + bp = ((unsigned long *)regs->sp) - 1; } /* Initialize stack info and make sure the frame data is accessible: */ diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 72b997eaa1fc..332ae6530fa8 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -598,7 +598,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, goto done; state->ip = regs->ip; - state->sp = kernel_stack_pointer(regs); + state->sp = regs->sp; state->bp = regs->bp; state->regs = regs; state->full_regs = true; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 918b5092a85f..d8359ebeea70 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1074,7 +1074,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", current->pid, regs->sp, regs->ip); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } return -1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 6a38717d179c..a76c12b38e92 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -583,7 +583,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) return 1; /* we let this handle by the calling routine */ current->thread.trap_nr = trapno; current->thread.error_code = error_code; - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); return 0; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0850b5149345..e2feacf921a0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -141,10 +141,10 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif - } :text = 0x9090 - /* End of text section */ - _etext = .; + /* End of text section */ + _etext = .; + } :text = 0x9090 NOTES :text :note @@ -368,6 +368,14 @@ SECTIONS __bss_stop = .; } + /* + * The memory occupied from _text to here, __end_of_kernel_reserve, is + * automatically reserved in setup_arch(). Anything after here must be + * explicitly reserved using memblock_reserve() or it will be discarded + * and treated as available memory. + */ + __end_of_kernel_reserve = .; + . = ALIGN(PAGE_SIZE); .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; @@ -379,10 +387,34 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Early scratch/workarea section: Lives outside of the kernel proper + * (_text - _end). + * + * Resides after _end because even though the .brk section is after + * __end_of_kernel_reserve, the .brk section is later reserved as a + * part of the kernel. Since it is located after __end_of_kernel_reserve + * it will be discarded and become part of the available memory. As + * such, it can only be used by very early boot code and must not be + * needed afterwards. + * + * Currently used by SME for performing in-place encryption of the + * kernel during boot. Resides on a 2MB boundary to simplify the + * pagetable setup used for SME in-place encryption. + */ + . = ALIGN(HPAGE_SIZE); + .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) { + __init_scratch_begin = .; + *(.init.scratch) + . 
= ALIGN(HPAGE_SIZE); + __init_scratch_end = .; + } +#endif + STABS_DEBUG DWARF_DEBUG - /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fc042419e670..840e12583b85 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -41,6 +41,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_NO_POLL select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4992e7c99588..ead681210306 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -134,6 +134,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (best) { + if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) + best->ecx |= F(MWAIT); + else + best->ecx &= ~F(MWAIT); + } + } + /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); @@ -276,19 +286,38 @@ static void cpuid_mask(u32 *word, int wordnum) *word &= boot_cpu_data.x86_capability[wordnum]; } -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { entry->function = function; entry->index = index; + entry->flags = 0; + cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; + + switch (function) { + case 2: + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + break; + case 4: + case 7: + case 0xb: + case 0xd: + case 0x14: + case 0x8000001d: + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + break; + } } -static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, - u32 func, u32 index, int *nent, int maxnent) +static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, int *nent, int maxnent) { + entry->function = func; + entry->index = 0; + entry->flags = 0; + switch (func) { case 0: entry->eax = 7; @@ -300,21 +329,83 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, break; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - if (index == 0) - entry->ecx = F(RDPID); + entry->eax = 0; + entry->ecx = F(RDPID); ++*nent; default: break; } - entry->function = func; - entry->index = index; - return 0; } -static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) +{ + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; + unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; + unsigned f_la57; + + /* cpuid 7.0.ebx */ + const u32 kvm_cpuid_7_0_ebx_x86_features = + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; + + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR); + + switch (index) { + case 0: + entry->eax = 0; + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); + /* TSC_ADJUST is emulated */ + entry->ebx |= F(TSC_ADJUST); + + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + f_la57 = entry->ecx & F(LA57); + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* Set LA57 based on hardware capability. */ + entry->ecx |= f_la57; + entry->ecx |= f_umip; + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); + + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); + break; + default: + WARN_ON_ONCE(1); + entry->eax = 0; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } +} + +static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, + int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; @@ -327,12 +418,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; - unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; - unsigned f_la57 = 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -377,7 +464,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP); + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -385,31 +472,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); - /* cpuid 7.0.ebx */ - const u32 kvm_cpuid_7_0_ebx_x86_features = - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; - /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; - /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); - - /* cpuid 7.0.edx*/ - const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR); - /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -418,12 +484,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (*nent >= maxnent) goto out; - do_cpuid_1_ent(entry, function, index); + do_host_cpuid(entry, function, 0); ++*nent; switch (function) { case 0: - entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); + /* Limited to the highest leaf implemented in KVM. */ + entry->eax = min(entry->eax, 0x1fU); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -441,14 +508,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 2: { int t, times = entry->eax & 0xff; - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times; ++t) { if (*nent >= maxnent) goto out; - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + do_host_cpuid(&entry[t], function, 0); ++*nent; } break; @@ -458,7 +523,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 0x8000001d: { int i, cache_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until cache_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -467,9 +531,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, cache_type = entry[i - 1].eax & 0x1f; if (!cache_type) break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + do_host_cpuid(&entry[i], function, i); ++*nent; } break; @@ -480,36 +542,21 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx = 0; entry->edx = 0; break; + /* function 7 has additional index. 
*/ case 7: { - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capability word 9 */ - if (index == 0) { - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_7_0_EBX); - // TSC_ADJUST is emulated - entry->ebx |= F(TSC_ADJUST); - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); - cpuid_mask(&entry->ecx, CPUID_7_ECX); - /* Set LA57 based on hardware capability. */ - entry->ecx |= f_la57; - entry->ecx |= f_umip; - /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); - entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_7_EDX); - /* - * We emulate ARCH_CAPABILITIES in software even - * if the host doesn't support it. - */ - entry->edx |= F(ARCH_CAPABILITIES); - } else { - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; + int i; + + for (i = 0; ; ) { + do_cpuid_7_mask(&entry[i], i); + if (i == entry->eax) + break; + if (*nent >= maxnent) + goto out; + + ++i; + do_host_cpuid(&entry[i], function, i); + ++*nent; } - entry->eax = 0; break; } case 9: @@ -543,11 +590,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = edx.full; break; } - /* function 0xb has additional index. */ + /* + * Per Intel's SDM, the 0x1f is a superset of 0xb, + * thus they can be handled by common code. + */ + case 0x1f: case 0xb: { int i, level_type; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; ; ++i) { if (*nent >= maxnent) @@ -556,9 +606,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + do_host_cpuid(&entry[i], function, i); ++*nent; } break; @@ -571,7 +619,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx = xstate_required_size(supported, false); entry->ecx = entry->ebx; entry->edx &= supported >> 32; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; if (!supported) break; @@ -580,7 +627,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (*nent >= maxnent) goto out; - do_cpuid_1_ent(&entry[i], function, idx); + do_host_cpuid(&entry[i], function, idx); if (idx == 1) { entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); @@ -597,8 +644,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } entry[i].ecx = 0; entry[i].edx = 0; - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; ++i; } @@ -611,12 +656,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (!f_intel_pt) break; - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (t = 1; t <= times; ++t) { if (*nent >= maxnent) goto out; - do_cpuid_1_ent(&entry[t], function, t); - entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + do_host_cpuid(&entry[t], function, t); ++*nent; } break; @@ -640,7 +683,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | - (1 << KVM_FEATURE_PV_SEND_IPI); + (1 << KVM_FEATURE_PV_SEND_IPI) | + (1 << KVM_FEATURE_POLL_CONTROL) | + (1 << KVM_FEATURE_PV_SCHED_YIELD); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 
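KVM_CPUID_FEATURES, extended above with POLL_CONTROL and PV_SCHED_YIELD, is nothing more than a bit vector handed back in EAX. A userspace sketch of assembling it; the bit positions follow the KVM_FEATURE_* numbering this series uses, but treat them as illustrative:

#include <stdio.h>

/* Stand-ins for the KVM_FEATURE_* bit numbers. */
enum {
	FEAT_STEAL_TIME     = 5,
	FEAT_PV_UNHALT      = 7,
	FEAT_PV_SEND_IPI    = 11,
	FEAT_POLL_CONTROL   = 12,
	FEAT_PV_SCHED_YIELD = 13,
};

int main(void)
{
	unsigned int eax = (1u << FEAT_PV_UNHALT) |
			   (1u << FEAT_PV_SEND_IPI) |
			   (1u << FEAT_POLL_CONTROL) |
			   (1u << FEAT_PV_SCHED_YIELD);
	int sched_info_on = 1;	/* pretend schedstats are enabled */

	if (sched_info_on)
		eax |= 1u << FEAT_STEAL_TIME;

	printf("KVM_CPUID_FEATURES eax = %#x\n", eax);	/* 0x38a0 */
	return 0;
}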
@@ -730,21 +775,19 @@ out: return r; } -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, - u32 idx, int *nent, int maxnent, unsigned int type) +static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, + int *nent, int maxnent, unsigned int type) { if (type == KVM_GET_EMULATED_CPUID) - return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + return __do_cpuid_func_emulated(entry, func, nent, maxnent); - return __do_cpuid_ent(entry, func, idx, nent, maxnent); + return __do_cpuid_func(entry, func, nent, maxnent); } #undef F struct kvm_cpuid_param { u32 func; - u32 idx; - bool has_leaf_count; bool (*qualifier)(const struct kvm_cpuid_param *param); }; @@ -788,11 +831,10 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, int limit, nent = 0, r = -E2BIG, i; u32 func; static const struct kvm_cpuid_param param[] = { - { .func = 0, .has_leaf_count = true }, - { .func = 0x80000000, .has_leaf_count = true }, - { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, + { .func = 0 }, + { .func = 0x80000000 }, + { .func = 0xC0000000, .qualifier = is_centaur_cpu }, { .func = KVM_CPUID_SIGNATURE }, - { .func = KVM_CPUID_FEATURES }, }; if (cpuid->nent < 1) @@ -816,19 +858,16 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, if (ent->qualifier && !ent->qualifier(ent)) continue; - r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent, type); + r = do_cpuid_func(&cpuid_entries[nent], ent->func, + &nent, cpuid->nent, type); if (r) goto out_free; - if (!ent->has_leaf_count) - continue; - limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) - r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent, type); + r = do_cpuid_func(&cpuid_entries[nent], func, + &nent, cpuid->nent, type); if (r) goto out_free; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9a327d5b6d1f..d78a61408243 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, - [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, - [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4a387a235424..8e409ad448f9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4258,7 +4258,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) ulong dr6; ctxt->ops->get_dr(ctxt, 6, &dr6); - dr6 &= ~15; + dr6 &= ~DR_TRAP_BITS; dr6 |= DR6_BD | DR6_RTM; ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index d6519a3aa959..7c6233d37c64 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -102,7 +102,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return mode != KVM_IRQCHIP_NONE; } -bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 924b3bd5a7b7..8ecd48d31800 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -75,7 +75,7 @@ int 
kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_lapic_enabled(vcpu)) { + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { if (!kvm_vector_hashing_enabled()) { if (!lowest) lowest = vcpu; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a21c440ff356..a232e76d8f23 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -69,6 +69,7 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 +#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -85,11 +86,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } -static inline void apic_clear_vector(int vec, void *bitmap) -{ - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int __apic_test_and_set_vector(int vec, void *bitmap) { return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -443,12 +439,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ - apic_clear_vector(vec, apic->regs + APIC_IRR); + kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; - apic_clear_vector(vec, apic->regs + APIC_IRR); + kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); if (apic_search_irr(apic) != -1) apic->irr_pending = true; } @@ -1053,9 +1049,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) - kvm_lapic_set_vector(vector, apic->regs + APIC_TMR); + kvm_lapic_set_vector(vector, + apic->regs + APIC_TMR); else - apic_clear_vector(vector, apic->regs + APIC_TMR); + kvm_lapic_clear_vector(vector, + apic->regs + APIC_TMR); } if (vcpu->arch.apicv_active) @@ -1313,21 +1311,45 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) return container_of(dev, struct kvm_lapic, dev); } +#define APIC_REG_MASK(reg) (1ull << ((reg) >> 4)) +#define APIC_REGS_MASK(first, count) \ + (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) + int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { unsigned char alignment = offset & 0xf; u32 result; /* this bitmask has a bit cleared for each reserved register */ - static const u64 rmask = 0x43ff01ffffffe70cULL; - - if ((alignment + len) > 4) { - apic_debug("KVM_APIC_READ: alignment error %x %d\n", - offset, len); - return 1; - } + u64 valid_reg_mask = + APIC_REG_MASK(APIC_ID) | + APIC_REG_MASK(APIC_LVR) | + APIC_REG_MASK(APIC_TASKPRI) | + APIC_REG_MASK(APIC_PROCPRI) | + APIC_REG_MASK(APIC_LDR) | + APIC_REG_MASK(APIC_DFR) | + APIC_REG_MASK(APIC_SPIV) | + APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) | + APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) | + APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | + APIC_REG_MASK(APIC_ESR) | + APIC_REG_MASK(APIC_ICR) | + APIC_REG_MASK(APIC_ICR2) | + APIC_REG_MASK(APIC_LVTT) | + APIC_REG_MASK(APIC_LVTTHMR) | + APIC_REG_MASK(APIC_LVTPC) | + APIC_REG_MASK(APIC_LVT0) | + APIC_REG_MASK(APIC_LVT1) | + APIC_REG_MASK(APIC_LVTERR) | + APIC_REG_MASK(APIC_TMICT) | + APIC_REG_MASK(APIC_TMCCT) | + APIC_REG_MASK(APIC_TDCR); + + /* ARBPRI is not valid on x2APIC */ + if (!apic_x2apic_mode(apic)) + valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); - if (offset > 0x3f0 || !(rmask & (1ULL 
<< (offset >> 4)))) { + if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) { apic_debug("KVM_APIC_READ: read reserved register %x\n", offset); return 1; @@ -1499,11 +1521,40 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) } } -void wait_lapic_expire(struct kvm_vcpu *vcpu) +static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, + s64 advance_expire_delta) { struct kvm_lapic *apic = vcpu->arch.apic; u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; - u64 guest_tsc, tsc_deadline, ns; + u64 ns; + + /* too early */ + if (advance_expire_delta < 0) { + ns = -advance_expire_delta * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } else { + /* too late */ + ns = advance_expire_delta * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } + + if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_adjust_done = false; + } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; +} + +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; if (apic->lapic_timer.expired_tscdeadline == 0) return; @@ -1514,34 +1565,15 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); + apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (!apic->lapic_timer.timer_advance_adjust_done) { - /* too early */ - if (guest_tsc < tsc_deadline) { - ns = (tsc_deadline - guest_tsc) * 1000000ULL; - do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); - } else { - /* too late */ - ns = (guest_tsc - tsc_deadline) * 1000000ULL; - do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); - } - if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; - if (unlikely(timer_advance_ns > 5000)) { - timer_advance_ns = 0; - apic->lapic_timer.timer_advance_adjust_done = true; - } - apic->lapic_timer.timer_advance_ns = timer_advance_ns; - } + if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } +EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); static void start_sw_tscdeadline(struct kvm_lapic *apic) { @@ -2014,7 +2046,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, apic_debug("%s: offset 0x%x with length 0x%x, and value is " "0x%x\n", __func__, offset, len, val); - kvm_lapic_reg_write(apic, offset & 0xff0, val); + kvm_lapic_reg_write(apic, offset, val); return 0; } @@ -2311,7 +2343,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = 
1000; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; apic->lapic_timer.timer_advance_adjust_done = false; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; @@ -2321,7 +2353,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) /* * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC satet has changed. + * thinking that APIC state has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ @@ -2330,6 +2362,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) return 0; nomem_free_apic: kfree(apic); + vcpu->arch.apic = NULL; nomem: return -ENOMEM; } @@ -2339,7 +2372,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (!apic_enabled(apic)) + if (!kvm_apic_hw_enabled(apic)) return -1; __apic_update_ppr(apic, &ppr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d6d049ba3045..36747174e4a8 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -32,6 +32,7 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; u32 timer_advance_ns; + s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; bool timer_advance_adjust_done; @@ -129,6 +130,11 @@ void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) +static inline void kvm_lapic_clear_vector(int vec, void *bitmap) +{ + clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline void kvm_lapic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -219,7 +225,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); -void wait_lapic_expire(struct kvm_vcpu *vcpu); +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 98f6e4f88b04..9a5814d8d194 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -140,9 +140,6 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) @@ -259,11 +256,20 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; */ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +static u8 __read_mostly shadow_phys_bits; static void mmu_spte_set(u64 *sptep, u64 spte); +static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + static inline bool kvm_available_flush_tlb_with_range(void) { @@ -468,6 +474,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected + * in CPU detection code, but MKTME treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore for MKTME + * we should still return physical address bits reported by CPUID. 
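kvm_get_shadow_phys_bits() finishes just below by returning cpuid_eax(0x80000008) & 0xff, the raw MAXPHYADDR, instead of trusting boot_cpu_data.x86_phys_bits, which the MKTME detection code shrinks by the number of key-ID bits. A hedged user-space equivalent of that raw read, again assuming an x86 host with <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.0x80000008: EAX[7:0] = physical address width in bits. */
        if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                return 1;
        printf("MAXPHYADDR: %u bits\n", eax & 0xff);
        return 0;
}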
+ */ + if (!boot_cpu_has(X86_FEATURE_TME) || + WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) + return boot_cpu_data.x86_phys_bits; + + return cpuid_eax(0x80000008) & 0xff; +} + static void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; @@ -481,6 +502,8 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_present_mask = 0; shadow_acc_track_mask = 0; + shadow_phys_bits = kvm_get_shadow_phys_bits(); + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is @@ -650,7 +673,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from - * gup_get_pte(arch/x86/mm/gup.c). + * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because kvm_set_pte_rmapp * coalesces them and we are running out of the MMU lock. Therefore @@ -1073,10 +1096,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) { - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else + if (!sp->role.direct) { sp->gfns[index] = gfn; + return; + } + + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) + pr_err_ratelimited("gfn mismatch under direct page %llx " + "(expected %llx, got %llx)\n", + sp->gfn, + kvm_mmu_page_get_gfn(sp, index), gfn); } /* @@ -3055,10 +3084,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, - *sptep, sptep); + trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -3070,8 +3096,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, } } - kvm_release_pfn_clean(pfn); - return ret; } @@ -3106,9 +3130,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret <= 0) return -1; - for (i = 0; i < ret; i++, gfn++, start++) + for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); + put_page(pages[i]); + } return 0; } @@ -3156,40 +3182,40 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, + int map_writable, int level, kvm_pfn_t pfn, + bool prefault) { - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + gfn_t base_gfn = gfn; if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return 0; + return RET_PF_RETRY; - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, level, gfn, pfn, prefault, - map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; + trace_kvm_mmu_spte_requested(gpa, level, pfn); + for_each_shadow_entry(vcpu, gpa, it) { + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == level) break; - } - drop_large_spte(vcpu, 
iterator.sptep); - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; + drop_large_spte(vcpu, it.sptep); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, 1, ACC_ALL); - - link_shadow_page(vcpu, iterator.sptep, sp); + link_shadow_page(vcpu, it.sptep, sp); } } - return emulate; + + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, + write, level, base_gfn, pfn, prefault, + map_writable); + direct_pte_prefetch(vcpu, it.sptep); + ++vcpu->stat.pf_fixed; + return ret; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -3216,11 +3242,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, kvm_pfn_t *pfnp, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; int level = *levelp; /* @@ -3247,8 +3272,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; kvm_get_pfn(pfn); @@ -3505,22 +3528,19 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -4015,19 +4035,6 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) - return false; - - if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { @@ -4147,22 +4154,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; - + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, 
level, pfn, prefault); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4494,7 +4498,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4531,13 +4535,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, false); if (!shadow_me_mask) @@ -4558,7 +4562,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - boot_cpu_data.x86_phys_bits, execonly); + shadow_phys_bits, execonly); } #define BYTE_MASK(access) \ @@ -5935,7 +5939,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -5977,7 +5981,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); return freed; } @@ -5999,6 +6003,34 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + + /* + * Set the reserved bits and the present bit of a paging-structure + * entry to generate page fault with PFER.RSV = 1. + */ + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; + + /* Set the present bit. */ + mask |= 1ull; + + /* + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ + if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) + mask &= ~1ull; + + kvm_mmu_set_mmio_spte_mask(mask, mask); +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; @@ -6015,6 +6047,8 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); + kvm_set_mmio_spte_mask(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index dd30dccd2ad5..d8001b4bca05 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -301,6 +301,65 @@ TRACE_EVENT( __entry->kvm_gen == __entry->spte_gen ) ); + +TRACE_EVENT( + kvm_mmu_set_spte, + TP_PROTO(int level, gfn_t gfn, u64 *sptep), + TP_ARGS(level, gfn, sptep), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, spte) + __field(u64, sptep) + __field(u8, level) + /* These depend on page entry type, so compute them now. */ + __field(bool, r) + __field(bool, x) + __field(u8, u) + ), + + TP_fast_assign( + __entry->gfn = gfn; + __entry->spte = *sptep; + __entry->sptep = virt_to_phys(sptep); + __entry->level = level; + __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); + __entry->x = is_executable_pte(__entry->spte); + __entry->u = shadow_user_mask ? 
!!(__entry->spte & shadow_user_mask) : -1; + ), + + TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", + __entry->gfn, __entry->spte, + __entry->r ? "r" : "-", + __entry->spte & PT_WRITABLE_MASK ? "w" : "-", + __entry->x ? "x" : "-", + __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), + __entry->level, __entry->sptep + ) +); + +TRACE_EVENT( + kvm_mmu_spte_requested, + TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), + TP_ARGS(addr, level, pfn), + + TP_STRUCT__entry( + __field(u64, gfn) + __field(u64, pfn) + __field(u8, level) + ), + + TP_fast_assign( + __entry->gfn = addr >> PAGE_SHIFT; + __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + __entry->level = level; + ), + + TP_printk("gfn %llx pfn %llx level %d", + __entry->gfn, __entry->pfn, __entry->level + ) +); + #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d583bcd119fc..7d5cdb3af594 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -540,6 +540,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + kvm_release_pfn_clean(pfn); return true; } @@ -619,6 +620,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; + gfn_t base_gfn; direct_access = gw->pte_access; @@ -663,35 +665,34 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - for (; - shadow_walk_okay(&it) && it.level > hlevel; - shadow_walk_next(&it)) { - gfn_t direct_gfn; + base_gfn = gw->gfn; + + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == hlevel) + break; + validate_direct_spte(vcpu, it.sptep, direct_access); drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) - continue; - - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); + if (!is_shadow_present_pte(*it.sptep)) { + sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + it.level - 1, true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); + } } - clear_sp_write_flooding_count(it.sptep); ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + it.level, base_gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - + ++vcpu->stat.pf_fixed; return ret; out_gpte_changed: - kvm_release_pfn_clean(pfn); return RET_PF_RETRY; } @@ -839,6 +840,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.pte_access &= ~ACC_EXEC_MASK; } + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; @@ -847,19 +849,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault); - ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, 
AUDIT_POST_PAGE_FAULT); - spin_unlock(&vcpu->kvm->mmu_lock); - - return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return RET_PF_RETRY; + return r; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 132d149494d6..aa5a2597305a 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -19,6 +19,9 @@ #include "lapic.h" #include "pmu.h" +/* This keeps the total size of the filter under 4k. */ +#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63 + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -141,6 +144,10 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu_event_filter *filter; + int i; + bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -152,6 +159,22 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); + if (filter) { + for (i = 0; i < filter->nevents; i++) + if (filter->events[i] == + (eventsel & AMD64_RAW_EVENT_MASK_NB)) + break; + if (filter->action == KVM_PMU_EVENT_ALLOW && + i == filter->nevents) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_DENY && + i < filter->nevents) + allow_event = false; + } + if (!allow_event) + return; + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; @@ -261,10 +284,10 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) ctr_val = rdtsc(); break; case VMWARE_BACKDOOR_PMC_REAL_TIME: - ctr_val = ktime_get_boot_ns(); + ctr_val = ktime_get_boottime_ns(); break; case VMWARE_BACKDOOR_PMC_APPARENT_TIME: - ctr_val = ktime_get_boot_ns() + + ctr_val = ktime_get_boottime_ns() + vcpu->kvm->arch.kvmclock_offset; break; default: @@ -348,3 +371,43 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } + +int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_pmu_event_filter tmp, *filter; + size_t size; + int r; + + if (copy_from_user(&tmp, argp, sizeof(tmp))) + return -EFAULT; + + if (tmp.action != KVM_PMU_EVENT_ALLOW && + tmp.action != KVM_PMU_EVENT_DENY) + return -EINVAL; + + if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) + return -E2BIG; + + size = struct_size(filter, events, tmp.nevents); + filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + if (!filter) + return -ENOMEM; + + r = -EFAULT; + if (copy_from_user(filter, argp, size)) + goto cleanup; + + /* Ensure nevents can't be changed between the user copies. 
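The comment above is completed just below by "*filter = tmp;": the filter header is fetched from user memory twice (once standalone, once as part of the full structure), and a racing thread could grow nevents between the two reads, so the validated first copy is written back over the second. A self-contained sketch of that double-fetch hardening pattern, with memcpy() standing in for copy_from_user() and load_filter() plus its types being hypothetical:

#include <stdlib.h>
#include <string.h>

struct filter {
        unsigned int action;
        unsigned int nevents;
        unsigned long long events[];
};

struct filter *load_filter(const void *user_buf, unsigned int max)
{
        struct filter hdr, *f;

        memcpy(&hdr, user_buf, sizeof(hdr));            /* first fetch */
        if (hdr.nevents > max)
                return NULL;

        f = malloc(sizeof(*f) + hdr.nevents * sizeof(f->events[0]));
        if (!f)
                return NULL;
        /* second fetch: header may have changed under us by now */
        memcpy(f, user_buf,
               sizeof(*f) + hdr.nevents * sizeof(f->events[0]));

        /* Re-impose the validated header; struct assignment skips events[]. */
        *f = hdr;
        return f;
}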
*/ + *filter = tmp; + + mutex_lock(&kvm->lock); + rcu_swap_protected(kvm->arch.pmu_event_filter, filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); + + synchronize_srcu_expedited(&kvm->srcu); + r = 0; +cleanup: + kfree(filter); + return r; +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 22dff661145a..58265f761c3b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -118,6 +118,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 48c865a4e5dd..583b9fa656f3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -364,6 +364,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* enable/disable Next RIP Save */ +static int nrips = true; +module_param(nrips, int, 0444); + /* enable/disable Virtual VMLOAD VMSAVE */ static int vls = true; module_param(vls, int, 0444); @@ -770,7 +774,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->vmcb->control.next_rip != 0) { + if (nrips && svm->vmcb->control.next_rip != 0) { WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; } @@ -807,7 +811,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(&svm->vcpu); - if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { + if (nr == BP_VECTOR && !nrips) { unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); /* @@ -1364,6 +1368,11 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + if (avic) { if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC) || @@ -3290,7 +3299,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info_err, KVM_ISA_SVM); - rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); @@ -3580,7 +3589,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) vmcb_gpa = svm->vmcb->save.rax; - rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); @@ -3935,7 +3944,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) { int err; - if (!static_cpu_has(X86_FEATURE_NRIPS)) + if (!nrips) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); @@ -5160,10 +5169,13 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_lapic_set_irr(vec, vcpu->arch.apic); smp_mb__after_atomic(); - if (avic_vcpu_is_running(vcpu)) - wrmsrl(SVM_AVIC_DOORBELL, - kvm_cpu_get_apicid(vcpu->cpu)); - else + if (avic_vcpu_is_running(vcpu)) { + int cpuid = vcpu->cpu; + + if (cpuid != get_cpu()) + wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); + put_cpu(); + } else kvm_vcpu_wake_up(vcpu); } @@ -5640,6 +5652,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xcr0(vcpu); + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + kvm_wait_lapic_expire(vcpu); + /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. 
Since vmentry is serialising on affected CPUs, there @@ -5861,9 +5877,9 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static void svm_check_processor_compat(void *rtn) +static int __init svm_check_processor_compat(void) { - *(int *)rtn = 0; + return 0; } static bool svm_cpu_has_accelerated_tpr(void) @@ -5875,6 +5891,7 @@ static bool svm_has_emulated_msr(int index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; default: break; @@ -6162,15 +6179,9 @@ out: return ret; } -static void svm_handle_external_intr(struct kvm_vcpu *vcpu) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - local_irq_enable(); - /* - * We must have an instruction with interrupts enabled, so - * the timer interrupt isn't delayed by the interrupt shadow. - */ - asm("nop"); - local_irq_disable(); + } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) @@ -7256,7 +7267,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_tdp_cr3 = set_tdp_cr3, .check_intercept = svm_check_intercept, - .handle_external_intr = svm_handle_external_intr, + .handle_exit_irqoff = svm_handle_exit_irqoff, .request_immediate_exit = __kvm_request_immediate_exit, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4d47a2631d1f..b5c831e79094 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1365,7 +1365,7 @@ TRACE_EVENT(kvm_hv_timer_state, __entry->vcpu_id = vcpu_id; __entry->hv_timer_in_use = hv_timer_in_use; ), - TP_printk("vcpu_id %x hv_timer %x\n", + TP_printk("vcpu_id %x hv_timer %x", __entry->vcpu_id, __entry->hv_timer_in_use) ); diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 5466c6d85cf3..72359709cdc1 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/smp.h> +#include "../hyperv.h" #include "evmcs.h" #include "vmcs.h" #include "vmx.h" @@ -313,6 +314,23 @@ void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) } #endif +bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) +{ + struct hv_vp_assist_page assist_page; + + *evmcs_gpa = -1ull; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return false; + + if (unlikely(!assist_page.enlighten_vmentry)) + return false; + + *evmcs_gpa = assist_page.current_nested_vmcs; + + return true; +} + uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index e0fcef85b332..39a24eec8884 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -195,6 +195,7 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5f9c1a200201..bb509c254939 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -41,15 +41,19 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) -static u16 shadow_read_only_fields[] = { -#define SHADOW_FIELD_RO(x) x, +struct shadow_vmcs_field { + u16 
encoding; + u16 offset; +}; +static struct shadow_vmcs_field shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static u16 shadow_read_write_fields[] = { -#define SHADOW_FIELD_RW(x) x, +static struct shadow_vmcs_field shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_write_fields = @@ -63,34 +67,40 @@ static void init_vmcs_shadow_fields(void) memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); for (i = j = 0; i < max_shadow_read_only_fields; i++) { - u16 field = shadow_read_only_fields[i]; + struct shadow_vmcs_field entry = shadow_read_only_fields[i]; + u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_only_fields || - shadow_read_only_fields[i + 1] != field + 1)) + shadow_read_only_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_only_field %x\n", field + 1); clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 if (field & 1) +#ifdef CONFIG_X86_64 continue; +#else + entry.offset += sizeof(u32); #endif - if (j < i) - shadow_read_only_fields[j] = field; - j++; + shadow_read_only_fields[j++] = entry; } max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { - u16 field = shadow_read_write_fields[i]; + struct shadow_vmcs_field entry = shadow_read_write_fields[i]; + u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_write_fields || - shadow_read_write_fields[i + 1] != field + 1)) + shadow_read_write_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_write_field %x\n", field + 1); + WARN_ONCE(field >= GUEST_ES_AR_BYTES && + field <= GUEST_TR_AR_BYTES, + "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); + /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist @@ -115,13 +125,13 @@ static void init_vmcs_shadow_fields(void) clear_bit(field, vmx_vmwrite_bitmap); clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 if (field & 1) +#ifdef CONFIG_X86_64 continue; +#else + entry.offset += sizeof(u32); #endif - if (j < i) - shadow_read_write_fields[j] = field; - j++; + shadow_read_write_fields[j++] = entry; } max_shadow_read_write_fields = j; } @@ -182,7 +192,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } @@ -238,22 +248,41 @@ static void free_nested(struct kvm_vcpu *vcpu) free_loaded_vmcs(&vmx->nested.vmcs02); } +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct 
loaded_vmcs *prev; int cpu; if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx_vcpu_put(vcpu); + prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load(vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu); + vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vm_entry_controls_reset_shadow(vmx); - vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); } @@ -930,8 +959,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && - !nested_ept) { + if (is_pae_paging(vcpu) && !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; @@ -1105,14 +1133,6 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) vmx->nested.msrs.misc_low = data; vmx->nested.msrs.misc_high = data >> 32; - /* - * If L1 has read-only VM-exit information fields, use the - * less permissive vmx_vmwrite_bitmap to specify write - * permissions for the shadow VMCS. - */ - if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); - return 0; } @@ -1214,6 +1234,11 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_VMX_VMCS_ENUM: vmx->nested.msrs.vmcs_enum = data; return 0; + case MSR_IA32_VMX_VMFUNC: + if (data & ~vmx->nested.msrs.vmfunc_controls) + return -EINVAL; + vmx->nested.msrs.vmfunc_controls = data; + return 0; default: /* * The rest of the VMX capability MSRs do not support restore. @@ -1301,41 +1326,29 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) } /* - * Copy the writable VMCS shadow fields back to the VMCS12, in case - * they have been modified by the L1 guest. Note that the "read-only" - * VM-exit information fields are actually writable if the vCPU is - * configured to support "VMWRITE to any supported field in the VMCS." + * Copy the writable VMCS shadow fields back to the VMCS12, in case they have + * been modified by the L1 guest. Note, "writable" in this context means + * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of + * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" + * VM-exit information fields (which are actually writable if the vCPU is + * configured to support "VMWRITE to any supported field in the VMCS"). */ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { - const u16 *fields[] = { - shadow_read_write_fields, - shadow_read_only_fields - }; - const int max_fields[] = { - max_shadow_read_write_fields, - max_shadow_read_only_fields - }; - int i, q; - unsigned long field; - u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); + struct shadow_vmcs_field field; + unsigned long val; + int i; preempt_disable(); vmcs_load(shadow_vmcs); - for (q = 0; q < ARRAY_SIZE(fields); q++) { - for (i = 0; i < max_fields[q]; i++) { - field = fields[q][i]; - field_value = __vmcs_readl(field); - vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); - } - /* - * Skip the VM-exit information fields if they are read-only. 
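Pairing each shadow-field encoding with its byte offset into struct vmcs12 is what lets the copy loops that follow replace the old per-encoding dispatch with plain pointer arithmetic. A reduced sketch of the idea (struct vmcs12_demo and its two members are made up for illustration; the encodings are the architectural GUEST_RIP and GUEST_INTERRUPTIBILITY_INFO numbers, and little-endian byte order is assumed, as on x86):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct vmcs12_demo {
        uint64_t guest_rip;
        uint32_t guest_interruptibility_info;
};

struct shadow_field {
        uint16_t encoding;      /* hardware VMCS field encoding */
        uint16_t offset;        /* byte offset into the software VMCS */
};

static const struct shadow_field demo_fields[] = {
        { 0x681e, offsetof(struct vmcs12_demo, guest_rip) },
        { 0x4824, offsetof(struct vmcs12_demo, guest_interruptibility_info) },
};

/* Width comes from bits 14:13 of the encoding; natural width treated as 8. */
static size_t field_width(uint16_t encoding)
{
        switch ((encoding >> 13) & 3) {
        case 0: return 2;       /* 16-bit */
        case 1: return 8;       /* 64-bit */
        case 2: return 4;       /* 32-bit */
        default: return 8;      /* natural width */
        }
}

static void write_any(struct vmcs12_demo *v, const struct shadow_field *f,
                      uint64_t val)
{
        /* Low bytes of val land in narrower fields on little-endian. */
        memcpy((char *)v + f->offset, &val, field_width(f->encoding));
}

int main(void)
{
        struct vmcs12_demo v = { 0 };

        write_any(&v, &demo_fields[0], 0xffffffff81000000ull);
        write_any(&v, &demo_fields[1], 0x9);
        return 0;
}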
- */ - if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - break; + for (i = 0; i < max_shadow_read_write_fields; i++) { + field = shadow_read_write_fields[i]; + val = __vmcs_readl(field.encoding); + vmcs12_write_any(vmcs12, field.encoding, field.offset, val); } vmcs_clear(shadow_vmcs); @@ -1346,7 +1359,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const u16 *fields[] = { + const struct shadow_vmcs_field *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -1354,18 +1367,20 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) max_shadow_read_write_fields, max_shadow_read_only_fields }; - int i, q; - unsigned long field; - u64 field_value = 0; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); + struct shadow_vmcs_field field; + unsigned long val; + int i, q; vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); - __vmcs_writel(field, field_value); + val = vmcs12_read_any(vmcs12, field.encoding, + field.offset); + __vmcs_writel(field.encoding, val); } } @@ -1623,7 +1638,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; * evmcs->host_idtr_base = vmcs12->host_idtr_base; * evmcs->host_rsp = vmcs12->host_rsp; - * sync_vmcs12() doesn't read these: + * sync_vmcs02_to_vmcs12() doesn't read these: * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; * evmcs->msr_bitmap = vmcs12->msr_bitmap; @@ -1768,26 +1783,22 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, bool from_launch) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_vp_assist_page assist_page; + bool evmcs_gpa_changed = false; + u64 evmcs_gpa; if (likely(!vmx->nested.enlightened_vmcs_enabled)) return 1; - if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) return 1; - if (unlikely(!assist_page.enlighten_vmentry)) - return 1; - - if (unlikely(assist_page.current_nested_vmcs != - vmx->nested.hv_evmcs_vmptr)) { - + if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { if (!vmx->nested.hv_evmcs) vmx->nested.current_vmptr = -1ull; nested_release_evmcs(vcpu); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs), + if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) return 0; @@ -1822,15 +1833,9 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } vmx->nested.dirty_vmcs12 = true; - /* - * As we keep L2 state for one guest only 'hv_clean_fields' mask - * can't be used when we switch between them. Reset it here for - * simplicity. - */ - vmx->nested.hv_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + vmx->nested.hv_evmcs_vmptr = evmcs_gpa; + evmcs_gpa_changed = true; /* * Unlike normal vmcs12, enlightened vmcs12 is not fully * reloaded from guest's memory (read only fields, fields not @@ -1844,10 +1849,19 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } } + + /* + * Clean fields data can't be used on VMLAUNCH and when we switch
+ */ + if (from_launch || evmcs_gpa_changed) + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + return 1; } -void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) +void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1868,7 +1882,7 @@ void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) copy_vmcs12_to_shadow(vmx); } - vmx->nested.need_vmcs12_sync = false; + vmx->nested.need_vmcs12_to_shadow_sync = false; } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) @@ -1948,8 +1962,20 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); - if (enable_pml) + /* + * The PML address never changes, so it is constant in vmcs02. + * Conceptually we want to copy the PML index from vmcs01 here, + * and then back to vmcs01 on nested vmexit. But since we flush + * the log and reset GUEST_PML_INDEX on each vmexit, the PML + * index is also effectively constant in vmcs02. + */ + if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); /* * Set the MSR load/store lists to match L0's settings. Only the @@ -1963,7 +1989,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmx_set_constant_host_state(vmx); } -static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, +static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { prepare_vmcs02_constant_state(vmx); @@ -1984,17 +2010,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) - prepare_vmcs02_early_full(vmx, vmcs12); + prepare_vmcs02_early_rare(vmx, vmcs12); /* * PIN CONTROLS */ - exec_control = vmcs12->pin_based_vm_exec_control; - - /* Preemption timer setting is computed directly in vmx_vcpu_run. */ - exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - vmx->loaded_vmcs->hv_timer_armed = false; + exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control |= (vmcs12->pin_based_vm_exec_control & + ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { @@ -2003,7 +2026,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } else { exec_control &= ~PIN_BASED_POSTED_INTR; } - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); + pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS @@ -2014,28 +2037,31 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; - /* - * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if - * nested_get_vmcs12_pages can't fix it up, the illegal value - * will result in a VM entry failure. - */ - if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); - } else { #ifdef CONFIG_X86_64 + else exec_control |= CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING; #endif - } /* * A vmexit (to either L1 hypervisor or L0 userspace) is always needed * for I/O port accesses. 
*/ - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + + /* + * This bit will be computed in nested_get_vmcs12_pages, because + * we do not have access to L1's MSR bitmap yet. For now, keep + * the same bit as before, hoping to avoid multiple VMWRITEs that + * only set/clear this bit. + */ + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; + + exec_controls_set(vmx, exec_control); /* * SECONDARY EXEC CONTROLS @@ -2061,22 +2087,19 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) - vmcs_write16(GUEST_INTR_STATUS, - vmcs12->guest_intr_status); - /* - * Write an illegal value to APIC_ACCESS_ADDR. Later, - * nested_get_vmcs12_pages will either fix it up or - * remove the VM execution control. + * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() + * will not have to rewrite the controls just for this bit. */ - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) - vmcs_write64(APIC_ACCESS_ADDR, -1ull); + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && + (vmcs12->guest_cr4 & X86_CR4_UMIP)) + exec_control |= SECONDARY_EXEC_DESC; - if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + secondary_exec_controls_set(vmx, exec_control); } /* @@ -2095,7 +2118,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (guest_efer != host_efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } - vm_entry_controls_init(vmx, exec_control); + vm_entry_controls_set(vmx, exec_control); /* * EXIT CONTROLS @@ -2107,17 +2130,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control = vmx_vmexit_ctrl(); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; - vm_exit_controls_init(vmx, exec_control); - - /* - * Conceptually we want to copy the PML address and index from - * vmcs01 here, and then back to vmcs01 on nested vmexit. But, - * since we always flush the log on each vmexit and never change - * the PML address (once set), this happens to be equivalent to - * simply resetting the index in vmcs02. 
- */ - if (enable_pml) - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vm_exit_controls_set(vmx, exec_control); /* * Interrupt/Exception Fields @@ -2138,7 +2151,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } } -static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; @@ -2162,6 +2175,8 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); @@ -2198,6 +2213,10 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } + + if (kvm_mpx_supported() && vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } if (nested_cpu_has_xsaves(vmcs12)) @@ -2233,14 +2252,6 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); set_cr4_guest_host_mask(vmx); - - if (kvm_mpx_supported()) { - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - else - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); - } } /* @@ -2259,20 +2270,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { - prepare_vmcs02_full(vmx, vmcs12); + if (vmx->nested.dirty_vmcs12 || hv_evmcs) { + prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; - } - /* - * First, the fields that are shadowed. This must be kept in sync - * with vmcs_shadow_fields.h. - */ - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + load_guest_pdptrs_vmcs12 = !hv_evmcs || + !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && @@ -2283,6 +2289,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); } + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -2372,6 +2381,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, entry_failure_code)) return -EINVAL; + /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. 
*/ + if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && + is_pae_paging(vcpu)) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -2609,6 +2627,30 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, !kvm_pat_valid(vmcs12->host_ia32_pat)) return -EINVAL; + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + + if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || + vmcs12->host_cs_selector == 0 || + vmcs12->host_tr_selector == 0 || + (vmcs12->host_ss_selector == 0 && !ia32e)) + return -EINVAL; + +#ifdef CONFIG_X86_64 + if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) || + is_noncanonical_address(vmcs12->host_gs_base, vcpu) || + is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) || + is_noncanonical_address(vmcs12->host_idtr_base, vcpu) || + is_noncanonical_address(vmcs12->host_tr_base, vcpu)) + return -EINVAL; +#endif + /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2616,8 +2658,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, * the host address-space size VM-exit control. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) @@ -2781,7 +2821,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" + : "memory" ); if (vmx->msr_autoload.host.nr) @@ -2851,18 +2891,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); } } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { map = &vmx->nested.virtual_apic_map; - /* - * If translation failed, VM entry will fail because - * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - */ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && @@ -2876,11 +2912,13 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * _not_ what the processor does but it's basically the * only possibility we have. 
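Dropping the TPR shadow is tolerable only because the checks above verified that L1 also intercepts CR8 loads and stores, so L2's CR8 accesses still trap even without the shadow. 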
*/ - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_TPR_SHADOW); + exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); } else { - printk("bad virtual-APIC page address\n"); - dump_vmcs(); + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to + * force VM-Entry to fail. + */ + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); } } @@ -2896,11 +2934,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); + exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); + exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); } /* @@ -2953,7 +2989,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) u32 exit_reason = EXIT_REASON_INVALID_STATE; u32 exit_qual; - evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); @@ -2964,6 +3000,25 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + /* + * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* + * nested early checks are disabled. In the event of a "late" VM-Fail, + * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its + * software model to the pre-VMEntry host state. When EPT is disabled, + * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes + * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing + * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to + * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested + * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is + * guaranteed to be overwritten with a shadow CR3 prior to re-entering + * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as + * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks + * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail + * path would need to manually save/restore vmcs01.GUEST_CR3. + */ + if (!enable_ept && !nested_early_check) + vmcs_writel(GUEST_CR3, vcpu->arch.cr3); + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); prepare_vmcs02_early(vmx, vmcs12); @@ -3059,7 +3114,7 @@ vmentry_fail_vmexit: vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; return 1; } @@ -3077,7 +3132,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) + if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) return 1; if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) @@ -3393,20 +3448,57 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; } -/* - * Update the guest state fields of vmcs12 to reflect changes that - * occurred while L2 was running. 
(The "IA-32e mode guest" bit of the - * VM-entry controls is also updated, since this is really a guest - * state bit.) - */ -static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); - vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); +static bool is_vmcs12_ext_field(unsigned long field) +{ + switch (field) { + case GUEST_ES_SELECTOR: + case GUEST_CS_SELECTOR: + case GUEST_SS_SELECTOR: + case GUEST_DS_SELECTOR: + case GUEST_FS_SELECTOR: + case GUEST_GS_SELECTOR: + case GUEST_LDTR_SELECTOR: + case GUEST_TR_SELECTOR: + case GUEST_ES_LIMIT: + case GUEST_CS_LIMIT: + case GUEST_SS_LIMIT: + case GUEST_DS_LIMIT: + case GUEST_FS_LIMIT: + case GUEST_GS_LIMIT: + case GUEST_LDTR_LIMIT: + case GUEST_TR_LIMIT: + case GUEST_GDTR_LIMIT: + case GUEST_IDTR_LIMIT: + case GUEST_ES_AR_BYTES: + case GUEST_DS_AR_BYTES: + case GUEST_FS_AR_BYTES: + case GUEST_GS_AR_BYTES: + case GUEST_LDTR_AR_BYTES: + case GUEST_TR_AR_BYTES: + case GUEST_ES_BASE: + case GUEST_CS_BASE: + case GUEST_SS_BASE: + case GUEST_DS_BASE: + case GUEST_FS_BASE: + case GUEST_GS_BASE: + case GUEST_LDTR_BASE: + case GUEST_TR_BASE: + case GUEST_GDTR_BASE: + case GUEST_IDTR_BASE: + case GUEST_PENDING_DBG_EXCEPTIONS: + case GUEST_BNDCFGS: + return true; + default: + break; + } - vmcs12->guest_rsp = kvm_rsp_read(vcpu); - vmcs12->guest_rip = kvm_rip_read(vcpu); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + return false; +} + +static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); @@ -3427,8 +3519,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); @@ -3444,11 +3534,69 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (kvm_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; +} + +static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) + return; + + + WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->nested.vmcs02; + vmx_vcpu_load(&vmx->vcpu, cpu); + + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_load(&vmx->vcpu, cpu); + put_cpu(); +} + +/* + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) 
+ */ +static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->nested.hv_evmcs) + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; + + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_rsp = kvm_rsp_read(vcpu); + vmcs12->guest_rip = kvm_rip_read(vcpu); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; else @@ -3469,10 +3617,12 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ if (enable_ept) { vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); - vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); - vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); - vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); - vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } } vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); @@ -3484,22 +3634,11 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - } - /* TODO: These cannot have changed unless we have MSR bitmaps and - * the relevant bit asks not to trap the change */ - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); } /* @@ -3517,11 +3656,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { - /* update guest state fields: */ - sync_vmcs12(vcpu, vmcs12); - /* update exit information fields: */ - vmcs12->vm_exit_reason = exit_reason; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -3775,18 +3910,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); - - /* - * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 - * points to 
shadow pages! Fortunately we only get here after a WARN_ON - * if EPT is disabled, so a VMabort is perfectly fine. - */ - if (enable_ept) { - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - } else { - nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); - } + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -3794,7 +3919,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * VMFail, like everything else we just need to ensure our * software model is up-to-date. */ - ept_save_pdptrs(vcpu); + if (enable_ept) + ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); @@ -3882,14 +4008,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vcpu->arch.tsc_offset -= vmcs12->tsc_offset; if (likely(!vmx->fail)) { - if (exit_reason == -1) - sync_vmcs12(vcpu, vmcs12); - else + sync_vmcs02_to_vmcs12(vcpu, vmcs12); + + if (exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); /* - * Must happen outside of sync_vmcs12() as it will + * Must happen outside of sync_vmcs02_to_vmcs12() as it will * also be used to capture vmcs12 cache as part of * capturing nVMX state for snapshot (migration). * @@ -3945,7 +4071,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -4008,7 +4134,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * #UD or #GP. 
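 * The new 'len' parameter carries the operand's true size, so the
 * segment-limit check below can test off + len - 1 instead of
 * assuming sizeof(u64).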
*/
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
-			u32 vmx_instruction_info, bool wr, gva_t *ret)
+			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
 {
 	gva_t off;
 	bool exn;
@@ -4115,7 +4241,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 		 */
 		if (!(s.base == 0 && s.limit == 0xffffffff &&
 		      ((s.type & 8) || !(s.type & 4))))
-			exn = exn || (off + sizeof(u64) > s.limit);
+			exn = exn || ((u64)off + len - 1 > s.limit);
 	}
 	if (exn) {
 		kvm_queue_exception_e(vcpu,
@@ -4134,7 +4260,8 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 	struct x86_exception e;
 
 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-			vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
+			vmcs_read32(VMX_INSTRUCTION_INFO), false,
+			sizeof(*vmpointer), &gva))
 		return 1;
 
 	if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
@@ -4300,11 +4427,13 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 	if (vmx->nested.current_vmptr == -1ull)
 		return;
 
+	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
 	if (enable_shadow_vmcs) {
 		/* copy to memory all shadowed fields in case they were modified */
 		copy_shadow_to_vmcs12(vmx);
-		vmx->nested.need_vmcs12_sync = false;
+		vmx->nested.need_vmcs12_to_shadow_sync = false;
 		vmx_disable_shadow_vmcs(vmx);
 	}
 	vmx->nested.posted_intr_nv = -1;
@@ -4334,6 +4463,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 zero = 0;
 	gpa_t vmptr;
+	u64 evmcs_gpa;
 
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
@@ -4349,10 +4479,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 		return nested_vmx_failValid(vcpu,
 			VMXERR_VMCLEAR_VMXON_POINTER);
 
-	if (vmx->nested.hv_evmcs_map.hva) {
-		if (vmptr == vmx->nested.hv_evmcs_vmptr)
-			nested_release_evmcs(vcpu);
-	} else {
+	/*
+	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
+	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's
+	 * no good way to distinguish it from a VMCS12) and we must not
+	 * corrupt it by writing to the non-existent 'launch_state' field. The
+	 * area doesn't have to be the currently active EVMCS on the calling
+	 * CPU and there's nothing KVM has to do to transition it from
+	 * 'active' to 'non-active' state. It is possible that the area will
+	 * stay mapped as vmx->nested.hv_evmcs but this shouldn't be a
+	 * problem. 
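+	 * (When that is the case, the emulated VMCLEAR below is skipped and
+	 * the instruction simply succeeds without touching the area.)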
+ */ + if (likely(!vmx->nested.enlightened_vmcs_enabled || + !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); @@ -4386,8 +4524,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u64 field_value; unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + int len; gva_t gva = 0; struct vmcs12 *vmcs12; + short offset; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4409,11 +4549,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vmcs12, field, &field_value) < 0) + + offset = vmcs_field_to_offset(field); + if (offset < 0) return nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + /* Read the field, zero-extended to a u64 field_value */ + field_value = vmcs12_read_any(vmcs12, field, offset); + /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending @@ -4423,21 +4570,45 @@ static int handle_vmread(struct kvm_vcpu *vcpu) kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), field_value); } else { + len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, &gva)) + vmx_instruction_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, - (is_long_mode(vcpu) ? 8 : 4), NULL); + kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL); } return nested_vmx_succeed(vcpu); } +static bool is_shadow_field_rw(unsigned long field) +{ + switch (field) { +#define SHADOW_FIELD_RW(x, y) case x: +#include "vmcs_shadow_fields.h" + return true; + default: + break; + } + return false; +} + +static bool is_shadow_field_ro(unsigned long field) +{ + switch (field) { +#define SHADOW_FIELD_RO(x, y) case x: +#include "vmcs_shadow_fields.h" + return true; + default: + break; + } + return false; +} static int handle_vmwrite(struct kvm_vcpu *vcpu) { unsigned long field; + int len; gva_t gva; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4452,6 +4623,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; struct vmcs12 *vmcs12; + short offset; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4463,11 +4635,11 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); else { + len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, len, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, - (is_64_bit_mode(vcpu) ? 
8 : 4), &e)) { + if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } @@ -4484,9 +4656,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - if (!is_guest_mode(vcpu)) + if (!is_guest_mode(vcpu)) { vmcs12 = get_vmcs12(vcpu); - else { + + /* + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties + * vmcs12, else we may crush a field or consume a stale value. + */ + if (!is_shadow_field_rw(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + } else { /* * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE * to shadowed-field sets the ALU flags for VMfailInvalid. @@ -4496,28 +4675,46 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) vmcs12 = get_shadow_vmcs12(vcpu); } - if (vmcs12_write_any(vmcs12, field, field_value) < 0) + offset = vmcs_field_to_offset(field); + if (offset < 0) return nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* - * Do not track vmcs12 dirty-state if in guest-mode - * as we actually dirty shadow vmcs12 instead of vmcs12. + * Some Intel CPUs intentionally drop the reserved bits of the AR byte + * fields on VMWRITE. Emulate this behavior to ensure consistent KVM + * behavior regardless of the underlying hardware, e.g. if an AR_BYTE + * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD + * from L1 will return a different value than VMREAD from L2 (L1 sees + * the stripped down value, L2 sees the full value as stored by KVM). */ - if (!is_guest_mode(vcpu)) { - switch (field) { -#define SHADOW_FIELD_RW(x) case x: -#include "vmcs_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) + field_value &= 0x1f0ff; + + vmcs12_write_any(vmcs12, field, offset, field_value); + + /* + * Do not track vmcs12 dirty-state if in guest-mode as we actually + * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated + * by L1 without a vmexit are always updated in the vmcs02, i.e. don't + * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. + */ + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { + /* + * L1 can read these fields without exiting, ensure the + * shadow VMCS is up-to-date. 
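+	 * (VMWRITEs to RW-shadowed fields never reach this handler while the
+	 * shadow VMCS is active, so only RO-shadowed fields need this manual
+	 * propagation into vmcs01's shadow VMCS.)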
+ */ + if (enable_shadow_vmcs && is_shadow_field_ro(field)) { + preempt_disable(); + vmcs_load(vmx->vmcs01.shadow_vmcs); + + __vmcs_writel(field, field_value); + + vmcs_clear(vmx->vmcs01.shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + preempt_enable(); } + vmx->nested.dirty_vmcs12 = true; } return nested_vmx_succeed(vcpu); @@ -4527,11 +4724,10 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) { vmx->nested.current_vmptr = vmptr; if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.need_vmcs12_sync = true; + vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; } @@ -4615,7 +4811,8 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) return 1; - if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, + true, sizeof(gpa_t), &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, @@ -4661,7 +4858,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); @@ -4670,13 +4867,11 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: + case VMX_EPT_EXTENT_CONTEXT: /* - * TODO: track mappings and invalidate - * single context requests appropriately + * TODO: Sync the necessary shadow EPT roots here, rather than + * at the next emulated VM-entry. */ - case VMX_EPT_EXTENT_CONTEXT: - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); break; default: BUG_ON(1); @@ -4723,7 +4918,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); @@ -5240,9 +5435,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) - kvm_state.flags |= KVM_STATE_NESTED_EVMCS; - if (nested_vmx_allowed(vcpu) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; @@ -5251,6 +5443,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); + if (vmx->nested.hv_evmcs) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; + if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) @@ -5284,12 +5479,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, * When running L2, the authoritative vmcs12 state is in the * vmcs02. 
When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
-	 * need_vmcs12_sync is set, in which case, the authoritative
+	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
-		sync_vmcs12(vcpu, vmcs12);
-	} else if (!vmx->nested.need_vmcs12_sync) {
+		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
		if (vmx->nested.hv_evmcs)
			copy_enlightened_to_vmcs12(vmx);
		else if (enable_shadow_vmcs)
@@ -5350,6 +5546,15 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
		if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
			return -EINVAL;

+		/*
+		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
+		 * enable the eVMCS capability on the vCPU. However, the code
+		 * has since been changed such that the flag signals that the
+		 * vmcs12 should be copied into the eVMCS in guest memory.
+		 *
+		 * To preserve backward compatibility, allow userspace
+		 * to set this flag even when there is no VMXON region.
+		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
@@ -5358,7 +5563,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
-	}
+	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;
@@ -5373,20 +5578,21 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
-	if (is_smm(vcpu) ? kvm_state->flags : kvm_state->hdr.vmx.smm.flags)
+	if (is_smm(vcpu) ?
+		(kvm_state->flags &
+		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
+		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

-	vmx_leave_nested(vcpu);
-	if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
-		if (!nested_vmx_allowed(vcpu))
+	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
+	    (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

-		nested_enable_evmcs(vcpu, NULL);
-	}
+	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
		return 0;
@@ -5411,7 +5617,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
		 * Sync eVMCS upon entry as we may not have
		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
		 */
-		vmx->nested.need_vmcs12_sync = true;
+		vmx->nested.need_vmcs12_to_shadow_sync = true;
	} else {
		return -EINVAL;
	}
@@ -5479,14 +5685,8 @@ error_guest_mode:
 void nested_vmx_vcpu_setup(void)
 {
	if (enable_shadow_vmcs) {
-		/*
-		 * At vCPU creation, "VMWRITE to any supported field
-		 * in the VMCS" is supported, so use the more
-		 * permissive vmx_vmread_bitmap to specify both read
-		 * and write permissions for the shadow VMCS. 
- */ vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } } @@ -5616,10 +5816,15 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | + SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_RDRAND_EXITING | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_XSAVES; /* * We can emulate "VMCS shadowing," even if the hardware @@ -5739,14 +5944,6 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; - /* - * Without EPT it is not possible to restore L1's CR3 and PDPTR on - * VMfail, because they are not available in vmcs01. Just always - * use hardware checks. - */ - if (!enable_ept) - nested_early_check = 1; - if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index e847ff1019a2..187d39bf0bf1 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -17,11 +17,11 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); -void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu); +void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, gva_t *ret); + u32 vmx_instruction_info, bool wr, int len, gva_t *ret); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index b8e50f76fefc..2200fb698dd0 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -146,7 +146,6 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value) __vmcs_writel(field, value); #ifndef CONFIG_X86_64 - asm volatile (""); __vmcs_writel(field+1, value >> 32); #endif } diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index cb6079f8a227..481ad879197b 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -42,6 +42,14 @@ struct vmcs_host_state { #endif }; +struct vmcs_controls_shadow { + u32 vm_entry; + u32 vm_exit; + u32 pin; + u32 exec; + u32 secondary_exec; +}; + /* * Track a VMCS that may be loaded on a certain CPU. 
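 * (struct loaded_vmcs now also embeds a vmcs_controls_shadow, a software
 * cache of the entry, exit, pin-based, primary and secondary execution
 * controls that spares a VMREAD on hot paths.) 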
If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs @@ -53,7 +61,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - bool hv_timer_armed; + bool hv_timer_soft_disabled; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; @@ -61,6 +69,7 @@ struct loaded_vmcs { unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; struct vmcs_host_state host_state; + struct vmcs_controls_shadow controls_shadow; }; static inline bool is_exception_n(u32 intr_info, u8 vector) @@ -115,6 +124,12 @@ static inline bool is_nmi(u32 intr_info) == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); } +static inline bool is_external_intr(u32 intr_info) +{ + return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR); +} + enum vmcs_field_width { VMCS_FIELD_WIDTH_U16 = 0, VMCS_FIELD_WIDTH_U64 = 1, diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 337718fc8a36..d0c6df373f67 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -395,69 +395,48 @@ static inline short vmcs_field_to_offset(unsigned long field) #undef ROL16 -/* - * Read a vmcs12 field. Since these can have varying lengths and we return - * one type, we chose the biggest type (u64) and zero-extend the return value - * to that size. Note that the caller, handle_vmread, might need to use only - * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of - * 64-bit fields are to be returned). - */ -static inline int vmcs12_read_any(struct vmcs12 *vmcs12, - unsigned long field, u64 *ret) +static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset) { - short offset = vmcs_field_to_offset(field); - char *p; - - if (offset < 0) - return offset; - - p = (char *)vmcs12 + offset; + char *p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *ret = *((natural_width *)p); - return 0; + return *((natural_width *)p); case VMCS_FIELD_WIDTH_U16: - *ret = *((u16 *)p); - return 0; + return *((u16 *)p); case VMCS_FIELD_WIDTH_U32: - *ret = *((u32 *)p); - return 0; + return *((u32 *)p); case VMCS_FIELD_WIDTH_U64: - *ret = *((u64 *)p); - return 0; + return *((u64 *)p); default: - WARN_ON(1); - return -ENOENT; + WARN_ON_ONCE(1); + return -1; } } -static inline int vmcs12_write_any(struct vmcs12 *vmcs12, - unsigned long field, u64 field_value){ - short offset = vmcs_field_to_offset(field); +static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset, u64 field_value) +{ char *p = (char *)vmcs12 + offset; - if (offset < 0) - return offset; - switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; - return 0; + break; case VMCS_FIELD_WIDTH_U32: *(u32 *)p = field_value; - return 0; + break; case VMCS_FIELD_WIDTH_U64: *(u64 *)p = field_value; - return 0; + break; case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *(natural_width *)p = field_value; - return 0; + break; default: - WARN_ON(1); - return -ENOENT; + WARN_ON_ONCE(1); + break; } - } #endif /* __KVM_X86_VMX_VMCS12_H */ diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 132432f375c2..eb1ecd16fd22 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -1,8 +1,12 @@ +#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW) +BUILD_BUG_ON(1) +#endif + #ifndef SHADOW_FIELD_RO -#define 
SHADOW_FIELD_RO(x) +#define SHADOW_FIELD_RO(x, y) #endif #ifndef SHADOW_FIELD_RW -#define SHADOW_FIELD_RW(x) +#define SHADOW_FIELD_RW(x, y) #endif /* @@ -28,47 +32,48 @@ */ /* 16-bits */ -SHADOW_FIELD_RW(GUEST_INTR_STATUS) -SHADOW_FIELD_RW(GUEST_PML_INDEX) -SHADOW_FIELD_RW(HOST_FS_SELECTOR) -SHADOW_FIELD_RW(HOST_GS_SELECTOR) +SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status) +SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index) +SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector) +SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector) /* 32-bits */ -SHADOW_FIELD_RO(VM_EXIT_REASON) -SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) -SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) -SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) -SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) -SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) -SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) -SHADOW_FIELD_RW(EXCEPTION_BITMAP) -SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) -SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) -SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) -SHADOW_FIELD_RW(TPR_THRESHOLD) -SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) -SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) +SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code) +SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes) +SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control) +SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control) +SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len) +SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value) /* Natural width */ -SHADOW_FIELD_RO(EXIT_QUALIFICATION) -SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) -SHADOW_FIELD_RW(GUEST_RIP) -SHADOW_FIELD_RW(GUEST_RSP) -SHADOW_FIELD_RW(GUEST_CR0) -SHADOW_FIELD_RW(GUEST_CR3) -SHADOW_FIELD_RW(GUEST_CR4) -SHADOW_FIELD_RW(GUEST_RFLAGS) -SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) -SHADOW_FIELD_RW(CR0_READ_SHADOW) -SHADOW_FIELD_RW(CR4_READ_SHADOW) -SHADOW_FIELD_RW(HOST_FS_BASE) -SHADOW_FIELD_RW(HOST_GS_BASE) +SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address) +SHADOW_FIELD_RW(GUEST_RIP, guest_rip) +SHADOW_FIELD_RW(GUEST_RSP, guest_rsp) +SHADOW_FIELD_RW(GUEST_CR0, guest_cr0) +SHADOW_FIELD_RW(GUEST_CR3, guest_cr3) +SHADOW_FIELD_RW(GUEST_CR4, guest_cr4) +SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask) +SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow) +SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow) +SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base) +SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base) /* 64-bit */ -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, 
guest_physical_address) #undef SHADOW_FIELD_RO #undef SHADOW_FIELD_RW diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d98eac371c0a..69536553446d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -389,6 +389,7 @@ static const struct kvm_vmx_segment_field { }; u64 host_efer; +static unsigned long host_idt_base; /* * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm @@ -1035,6 +1036,33 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) +{ + if (unlikely(fs_sel != host->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host->gs_sel = gs_sel; + } + if (unlikely(fs_base != host->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host->fs_base = fs_base; + } + if (unlikely(gs_base != host->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host->gs_base = gs_base; + } +} + void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1053,20 +1081,18 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ - if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { - vmx->guest_msrs_dirty = false; + if (!vmx->guest_msrs_ready) { + vmx->guest_msrs_ready = true; for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, vmx->guest_msrs[i].mask); } - - if (vmx->loaded_cpu_state) + if (vmx->guest_state_loaded) return; - vmx->loaded_cpu_state = vmx->loaded_vmcs; - host_state = &vmx->loaded_cpu_state->host_state; + host_state = &vmx->loaded_vmcs->host_state; /* * Set host fs and gs selectors. 
Unfortunately, 22.2.3 does not @@ -1100,42 +1126,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - if (unlikely(fs_sel != host_state->fs_sel)) { - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->fs_sel = fs_sel; - } - if (unlikely(gs_sel != host_state->gs_sel)) { - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - } - if (unlikely(fs_base != host_state->fs_base)) { - vmcs_writel(HOST_FS_BASE, fs_base); - host_state->fs_base = fs_base; - } - if (unlikely(gs_base != host_state->gs_base)) { - vmcs_writel(HOST_GS_BASE, gs_base); - host_state->gs_base = gs_base; - } + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + vmx->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->loaded_cpu_state) + if (!vmx->guest_state_loaded) return; - WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); - host_state = &vmx->loaded_cpu_state->host_state; + host_state = &vmx->loaded_vmcs->host_state; ++vmx->vcpu.stat.host_state_reload; - vmx->loaded_cpu_state = NULL; #ifdef CONFIG_X86_64 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -1161,13 +1165,15 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); + vmx->guest_state_loaded = false; + vmx->guest_msrs_ready = false; } #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->loaded_cpu_state) + if (vmx->guest_state_loaded) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; @@ -1176,7 +1182,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->loaded_cpu_state) + if (vmx->guest_state_loaded) wrmsrl(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; @@ -1225,11 +1231,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) pi_set_on(pi_desc); } -/* - * Switches to specified vcpu, until a matching vcpu_put(), but assumes - * vcpu mutex is already taken. - */ -void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1290,8 +1292,20 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_has_tsc_control && vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) decache_tsc_multiplier(vmx); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. 
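+ * (The VMCS-specific half of this work was split out into
+ * vmx_vcpu_load_vmcs() above; this wrapper adds the posted-interrupt,
+ * PKRU and DEBUGCTL bookkeeping on top.)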
+ */ +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); vmx->host_debugctlmsr = get_debugctlmsr(); } @@ -1310,7 +1324,7 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) pi_set_sn(pi_desc); } -void vmx_vcpu_put(struct kvm_vcpu *vcpu) +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1579,7 +1593,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_dirty = true; + vmx->guest_msrs_ready = false; if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); @@ -1692,9 +1706,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; - case MSR_IA32_POWER_CTL: - msr_info->data = vmx->msr_ia32_power_ctl; - break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1718,7 +1729,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) + if (!vmx_xsaves_supported() || + (!msr_info->host_initiated && + !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) return 1; msr_info->data = vcpu->arch.ia32_xss; break; @@ -1817,17 +1831,28 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif case MSR_IA32_SYSENTER_CS: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_cs = data; vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_eip = data; vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_esp = data; vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_POWER_CTL: - vmx->msr_ia32_power_ctl = data; + case MSR_IA32_DEBUGCTLMSR: + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; + + ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1896,9 +1921,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) MSR_TYPE_W); break; case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + + if (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) + get_vmcs12(vcpu)->guest_ia32_pat = data; + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_pat_valid(data)) - return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; @@ -1932,7 +1962,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) + if (!vmx_xsaves_supported() || + (!msr_info->host_initiated && + !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) return 1; /* * The only supported bit as of Skylake is bit 8, but @@ -2435,6 +2468,7 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) return -ENOMEM; loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs->hv_timer_soft_disabled = false; loaded_vmcs_init(loaded_vmcs); if 
(cpu_has_vmx_msr_bitmap()) { @@ -2455,6 +2489,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + memset(&loaded_vmcs->controls_shadow, 0, + sizeof(struct vmcs_controls_shadow)); return 0; @@ -2737,7 +2773,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty)) return; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); @@ -2749,7 +2785,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (is_pae_paging(vcpu)) { mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); @@ -2766,22 +2802,20 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) vmx_decache_cr3(vcpu); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | - (CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); + exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); + exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } @@ -2881,6 +2915,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { + struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. 
Do not let guests control @@ -2891,20 +2926,19 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; - else if (to_vmx(vcpu)->rmode.vm86_active) + else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { if (cr4 & X86_CR4_UMIP) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); + } } if (cr4 & X86_CR4_VMXE) { @@ -2919,7 +2953,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) + if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -3537,7 +3571,7 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) u8 mode = 0; if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + (secondary_exec_controls_get(to_vmx(vcpu)) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { mode |= MSR_BITMAP_MODE_X2APIC; if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) @@ -3731,7 +3765,6 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; - struct desc_ptr dt; unsigned long cr0, cr3, cr4; cr0 = read_cr0(); @@ -3767,9 +3800,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; + vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ @@ -3798,7 +3829,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } -static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; @@ -3808,8 +3839,9 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!enable_vnmi) pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; - /* Enable the preemption timer dynamically */ - pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + if (!enable_preemption_timer) + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + return pin_based_exec_ctrl; } @@ -3817,14 +3849,14 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); else - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } @@ -4015,15 +4047,14 @@ static void 
vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); vmx->hv_deadline_tsc = -1; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); + exec_controls_set(vmx, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - vmx->secondary_exec_control); + secondary_exec_controls_set(vmx, vmx->secondary_exec_control); } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { @@ -4081,10 +4112,10 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vm_exit_controls_init(vmx, vmx_vmexit_ctrl()); + vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ - vm_entry_controls_init(vmx, vmx_vmentry_ctrl()); + vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); @@ -4208,8 +4239,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void enable_irq_window(struct kvm_vcpu *vcpu) { - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -4220,8 +4250,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4442,11 +4471,11 @@ static void kvm_machine_check(void) static int handle_machine_check(struct kvm_vcpu *vcpu) { - /* already handled by vcpu_run */ + /* handled by vmx_vcpu_run() */ return 1; } -static int handle_exception(struct kvm_vcpu *vcpu) +static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -4458,11 +4487,8 @@ static int handle_exception(struct kvm_vcpu *vcpu) vect_info = vmx->idt_vectoring_info; intr_info = vmx->exit_intr_info; - if (is_machine_check(intr_info)) - return handle_machine_check(vcpu); - - if (is_nmi(intr_info)) - return 1; /* already handled by vmx_vcpu_run() */ + if (is_machine_check(intr_info) || is_nmi(intr_info)) + return 1; /* handled by handle_exception_nmi_irqoff() */ if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -4518,7 +4544,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) skip_emulated_instruction(vcpu); @@ -4763,7 +4789,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= DR6_BD | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -4771,8 +4797,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_MOV_DR_EXITING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers @@ -4816,7 +4841,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = 
vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -4876,8 +4901,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5131,8 +5155,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!enable_vnmi); - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5144,7 +5167,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; - u32 cpu_exec_ctrl; bool intr_window_requested; unsigned count = 130; @@ -5155,8 +5177,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; + intr_window_requested = exec_controls_get(vmx) & + CPU_BASED_VIRTUAL_INTR_PENDING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) @@ -5342,7 +5364,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) * is read even if it isn't needed (e.g., for type==all) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) + vmx_instruction_info, false, + sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { @@ -5437,8 +5460,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - if (!to_vmx(vcpu)->req_immediate_exit) + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->req_immediate_exit && + !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) kvm_lapic_expired_hv_timer(vcpu); + return 1; } @@ -5469,7 +5496,7 @@ static int handle_encls(struct kvm_vcpu *vcpu) * to be done to userspace and return 0. */ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { - [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, @@ -5952,6 +5979,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 sec_exec_control; if (!lapic_in_kernel(vcpu)) @@ -5963,11 +5991,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) /* Postpone execution until vmcs01 is the current VMCS. 
*/ if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; + vmx->nested.change_vmcs01_virtual_apic_mode = true; return; } - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -5989,7 +6017,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; break; } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + secondary_exec_controls_set(vmx, sec_exec_control); vmx_update_msr_bitmap(vcpu); } @@ -6107,76 +6135,81 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) +static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - u32 exit_intr_info = 0; - u16 basic_exit_reason = (u16)vmx->exit_reason; - - if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) - return; - - if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmx->exit_intr_info = exit_intr_info; + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || - is_machine_check(exit_intr_info)) + if (is_machine_check(vmx->exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(exit_intr_info)) { + if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); } } -static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) - == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { - unsigned int vector; - unsigned long entry; - gate_desc *desc; - struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int vector; + unsigned long entry; #ifdef CONFIG_X86_64 - unsigned long tmp; + unsigned long tmp; #endif + gate_desc *desc; + u32 intr_info; + + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (WARN_ONCE(!is_external_intr(intr_info), + "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + return; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(desc); - asm volatile( + vector = intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)host_idt_base + vector; + entry = gate_offset(desc); + + kvm_before_interrupt(vcpu); + + asm volatile( #ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" - "push %[sp]\n\t" + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" #endif - "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" - CALL_NOSPEC - : + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + CALL_NOSPEC + : #ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), + [sp]"=&r"(tmp), #endif - ASM_CALL_CONSTRAINT - : - THUNK_TARGET(entry), - [ss]"i"(__KERNEL_DS), - 
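handle_external_interrupt_irqoff() above dispatches the interrupt that caused the VM-exit by looking up its vector in the host IDT and CALLing the gate's entry point on a synthesized interrupt frame. For reference, a sketch of how gate_offset() assembles the entry address from a 64-bit gate descriptor (simplified layout; the real struct uses bitfields):

struct idt_gate64 {
        u16 offset_low;
        u16 segment;
        u16 bits;                       /* IST, type, DPL, P */
        u16 offset_middle;
        u32 offset_high;
        u32 reserved;
};

static unsigned long gate_entry(const struct idt_gate64 *g)
{
        return (unsigned long)g->offset_low |
               ((unsigned long)g->offset_middle << 16) |
               ((unsigned long)g->offset_high << 32);
}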
[cs]"i"(__KERNEL_CS) - ); - } + ASM_CALL_CONSTRAINT + : + THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); + + kvm_after_interrupt(vcpu); +} +STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); + +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu); + else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) + handle_exception_nmi_irqoff(vmx); } -STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); static bool vmx_has_emulated_msr(int index) { @@ -6187,6 +6220,8 @@ static bool vmx_has_emulated_msr(int index) * real mode. */ return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return nested; case MSR_AMD64_VIRT_SPEC_CTRL: /* This is AMD only. */ return false; @@ -6332,15 +6367,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); - if (!vmx->loaded_vmcs->hv_timer_armed) - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = true; -} - static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6348,11 +6374,9 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) u32 delta_tsc; if (vmx->req_immediate_exit) { - vmx_arm_hv_timer(vmx, 0); - return; - } - - if (vmx->hv_deadline_tsc != -1) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (vmx->hv_deadline_tsc != -1) { tscl = rdtsc(); if (vmx->hv_deadline_tsc > tscl) /* set_hv_timer ensures the delta fits in 32-bits */ @@ -6361,14 +6385,12 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) else delta_tsc = 0; - vmx_arm_hv_timer(vmx, delta_tsc); - return; + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); + vmx->loaded_vmcs->hv_timer_soft_disabled = true; } - - if (vmx->loaded_vmcs->hv_timer_armed) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = false; } void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) @@ -6401,8 +6423,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_write32(PLE_WINDOW, vmx->ple_window); } - if (vmx->nested.need_vmcs12_sync) - nested_sync_from_vmcs12(vcpu); + if (vmx->nested.need_vmcs12_to_shadow_sync) + nested_sync_vmcs12_to_shadow(vcpu); if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); @@ -6440,7 +6462,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_update_hv_timer(vcpu); + if (enable_preemption_timer) + vmx_update_hv_timer(vcpu); + + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -6533,13 +6560,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = 0; vmx->exit_reason = vmx->fail ? 
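vmx_update_hv_timer() above stops toggling PIN_BASED_VMX_PREEMPTION_TIMER on every entry: when enable_preemption_timer is set the control stays on, and "disabling" means programming the maximum count once. A compact summary of the three states (sketch of the logic above):

/*
 * req_immediate_exit       -> TIMER_VALUE = 0 (fire at once)
 * hv_deadline_tsc != -1    -> TIMER_VALUE = remaining TSC delta
 *                             (set_hv_timer ensures it fits in 32 bits)
 * neither                  -> TIMER_VALUE = -1, written at most once per
 *                             VMCS, tracked by hv_timer_soft_disabled
 */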
0xdead : vmcs_read32(VM_EXIT_REASON); + if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + kvm_machine_check(); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) return; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } @@ -6630,6 +6659,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(kvm)) { + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; @@ -6726,22 +6761,22 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static void __init vmx_check_processor_compat(void *rtn) +static int __init vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; - *(int *)rtn = 0; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - *(int *)rtn = -EIO; + return -EIO; if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, enable_apicv); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); - *(int *)rtn = -EIO; + return -EIO; } + return 0; } static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) @@ -6795,7 +6830,7 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } -static void vmcs_set_secondary_exec_control(u32 new_ctl) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) { /* * These bits in the secondary execution controls field @@ -6809,10 +6844,10 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + u32 new_ctl = vmx->secondary_exec_control; + u32 cur_ctl = secondary_exec_controls_get(vmx); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - (new_ctl & ~mask) | (cur_ctl & mask)); + secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); } /* @@ -6950,7 +6985,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx->secondary_exec_control); + vmcs_set_secondary_exec_control(vmx); } if (nested_vmx_allowed(vcpu)) @@ -7424,10 +7459,14 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) static __init int hardware_setup(void) { unsigned long host_bndcfgs; + struct desc_ptr dt; int r, i; rdmsrl_safe(MSR_EFER, &host_efer); + store_idt(&dt); + host_idt_base = dt.address; + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); @@ -7531,17 +7570,33 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_preemption_timer()) - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + enable_preemption_timer = false; - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + if (enable_preemption_timer) { + 
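The new msr_bitmap hunk above passes read access to the core C-state residency MSRs straight through to the guest when the VM owner opted out of C-state exits. A hypothetical userspace sketch of enabling that behavior (per-VM capability; names from the KVM UAPI, error handling omitted):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int disable_cstate_exits(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_X86_DISABLE_EXITS,
                .args[0] = KVM_X86_DISABLE_EXITS_CSTATE,
        };

        /* callers should first probe KVM_CHECK_EXTENSION for the cap */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}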
u64 use_timer_freq = 5000ULL * 1000 * 1000; u64 vmx_msr; rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); cpu_preemption_timer_multi = vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { + + if (tsc_khz) + use_timer_freq = (u64)tsc_khz * 1000; + use_timer_freq >>= cpu_preemption_timer_multi; + + /* + * KVM "disables" the preemption timer by setting it to its max + * value. Don't use the timer if it might cause spurious exits + * at a rate faster than 0.1 Hz (of uninterrupted guest time). + */ + if (use_timer_freq > 0xffffffffu / 10) + enable_preemption_timer = false; + } + + if (!enable_preemption_timer) { kvm_x86_ops->set_hv_timer = NULL; kvm_x86_ops->cancel_hv_timer = NULL; + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; } kvm_set_posted_intr_wakeup_handler(wakeup_handler); @@ -7683,7 +7738,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_tdp_cr3 = vmx_set_cr3, .check_intercept = vmx_check_intercept, - .handle_external_intr = vmx_handle_external_intr, + .handle_exit_irqoff = vmx_handle_exit_irqoff, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 61128b48c503..82d0bc3a4d52 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -109,14 +109,21 @@ struct nested_vmx { * to guest memory during VM exit. */ struct vmcs12 *cached_shadow_vmcs12; + /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. */ - bool need_vmcs12_sync; + bool need_vmcs12_to_shadow_sync; bool dirty_vmcs12; /* + * Indicates lazily loaded guest state has not yet been decached from + * vmcs02. + */ + bool need_sync_vmcs02_to_vmcs12_rare; + + /* * vmcs02 has been initialized, i.e. state that is constant for * vmcs02 has been written to the backing VMCS. Initialization * is delayed until L1 actually attempts to run a nested VM. @@ -180,14 +187,24 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; u8 msr_bitmap_mode; + + /* + * If true, host state has been stored in vmx->loaded_vmcs for + * the CPU registers that only need to be switched when transitioning + * to/from the kernel, and the registers have been loaded with guest + * values. If false, host state is loaded in the CPU registers + * and vmx->loaded_vmcs->host_state is invalid. + */ + bool guest_state_loaded; + u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; + struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; - bool guest_msrs_dirty; - unsigned long host_idt_base; + bool guest_msrs_ready; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -195,21 +212,15 @@ struct vcpu_vmx { u64 spec_ctrl; - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; u32 secondary_exec_control; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. loaded_cpu_state points - * to the VMCS whose state is loaded into the CPU registers that only - * need to be switched when transitioning to/from the kernel; a NULL - * value indicates that host state is loaded. + * guest (L2), it points to a different VMCS. 
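The 0.1 Hz guard above deserves a worked example, with illustrative numbers that are not from the source:

/*
 * tsc_khz = 2600000                -> use_timer_freq = 2.6e9 Hz
 * VMX_MISC timer rate shift = 5    -> use_timer_freq >>= 5, ~81.25 MHz
 * max count 0xffffffff             -> period = 2^32 / 81.25e6 ~= 52.9 s
 *
 * One spurious exit per ~52.9 s is slower than one per 10 s (0.1 Hz),
 * so the timer stays usable; "use_timer_freq > 0xffffffffu / 10" is
 * simply "max-count period shorter than 10 seconds".
 */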
*/ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; - struct loaded_vmcs *loaded_cpu_state; struct msr_autoload { struct vmx_msrs guest; @@ -260,8 +271,6 @@ struct vcpu_vmx { unsigned long host_debugctlmsr; - u64 msr_ia32_power_ctl; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included @@ -292,12 +301,14 @@ struct kvm_vmx { }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void vmx_vcpu_put(struct kvm_vcpu *vcpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -376,69 +387,31 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); -} - -static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_ENTRY_CONTROLS, val); - vmx->vm_entry_controls_shadow = val; -} - -static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_entry_controls_shadow != val) - vm_entry_controls_init(vmx, val); -} - -static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_entry_controls_shadow; -} - -static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); -} - -static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); -} - -static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); -} - -static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_EXIT_CONTROLS, val); - vmx->vm_exit_controls_shadow = val; -} - -static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_exit_controls_shadow != val) - vm_exit_controls_init(vmx, val); -} - -static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_exit_controls_shadow; -} - -static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); -} - -static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +#define BUILD_CONTROLS_SHADOW(lname, uname) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write32(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return vmx->loaded_vmcs->controls_shadow.lname; \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 
val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { @@ -468,6 +441,7 @@ static inline u32 vmx_vmexit_ctrl(void) } u32 vmx_exec_control(struct vcpu_vmx *vmx); +u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9857992d4e58..4a0b74ecd1de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -67,6 +67,7 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/intel_pt.h> +#include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -716,7 +717,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) + if (!is_pae_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -959,8 +960,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu) && (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))) return 1; - else if (is_pae(vcpu) && is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + else if (is_pae_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); @@ -1173,7 +1174,28 @@ static u32 emulated_msrs[] = { MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, + /* + * The following list leaves out MSRs whose values are determined + * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. + * We always support the "true" VMX control MSRs, even if the host + * processor does not, so I am putting these registers here rather + * than in msrs_to_save. 
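Each BUILD_CONTROLS_SHADOW() invocation above stamps out four accessors. For example, BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) expands to roughly:

static inline void pin_controls_set(struct vcpu_vmx *vmx, u32 val)
{
        if (vmx->loaded_vmcs->controls_shadow.pin != val) {
                vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
                vmx->loaded_vmcs->controls_shadow.pin = val;
        }
}

static inline u32 pin_controls_get(struct vcpu_vmx *vmx)
{
        return vmx->loaded_vmcs->controls_shadow.pin;
}

plus pin_controls_setbit()/pin_controls_clearbit() built on the two above. Keeping the shadow in loaded_vmcs means it follows vmcs01/vmcs02 switches automatically, which is why the old per-vcpu entry/exit control shadows could be removed.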
+ */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, }; static unsigned num_emulated_msrs; @@ -1209,11 +1231,12 @@ static u32 msr_based_features[] = { static unsigned int num_msr_based_features; -u64 kvm_get_arch_capabilities(void) +static u64 kvm_get_arch_capabilities(void) { - u64 data; + u64 data = 0; - rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* * If we're doing cache flushes (either "always" or "cond") @@ -1229,7 +1252,6 @@ u64 kvm_get_arch_capabilities(void) return data; } -EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { @@ -1554,7 +1576,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) vcpu->arch.tsc_always_catchup = 1; return 0; } else { - WARN(1, "user requested TSC rate below hardware speed\n"); + pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); return -1; } } @@ -1564,8 +1586,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) user_tsc_khz, tsc_khz); if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { - WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); + pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); return -1; } @@ -1728,7 +1750,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); + ns = ktime_get_boottime_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -2070,7 +2092,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; + return ktime_get_boottime_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2086,7 +2108,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boot_ns() + ka->kvmclock_offset; + ret = ktime_get_boottime_ns() + ka->kvmclock_offset; put_cpu(); @@ -2185,7 +2207,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); + kernel_ns = ktime_get_boottime_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2544,13 +2566,24 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && + ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + return 1; + vcpu->arch.ia32_misc_enable_msr = data; + kvm_update_cpuid(vcpu); + } else { + vcpu->arch.ia32_misc_enable_msr = data; + } break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; + case MSR_IA32_POWER_CTL: + vcpu->arch.msr_ia32_power_ctl = data; + break; case 
MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2625,6 +2658,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; + case MSR_KVM_POLL_CONTROL: + /* only enable bit supported */ + if (data & (-1ULL << 1)) + return 1; + + vcpu->arch.msr_kvm_poll_control = data; + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: @@ -2802,6 +2843,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_POWER_CTL: + msr_info->data = vcpu->arch.msr_ia32_power_ctl; + break; case MSR_IA32_TSC: msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; break; @@ -2874,6 +2918,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_PV_EOI_EN: msr_info->data = vcpu->arch.pv_eoi.msr_val; break; + case MSR_KVM_POLL_CONTROL: + msr_info->data = vcpu->arch.msr_kvm_poll_control; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -3083,6 +3130,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_PMU_EVENT_FILTER: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -3095,7 +3143,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE; + r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE; if(kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; @@ -4612,6 +4661,8 @@ split_irqchip_unlock: kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -4926,6 +4977,9 @@ set_identity_unlock: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } + case KVM_SET_PMU_EVENT_FILTER: + r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -6378,7 +6432,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); *r = EMULATE_DONE; @@ -6705,7 +6759,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm_vcpu *vcpu; int cpu; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); @@ -6731,7 +6785,7 @@ static void kvm_hyperv_tsc_notifier(void) spin_unlock(&ka->pvclock_gtod_sync_lock); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } #endif @@ -6782,17 +6836,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) smp_call_function_single(cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) + if (vcpu->cpu != raw_smp_processor_id()) send_ipi = 1; } } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -6907,35 +6961,6 @@ 
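MSR_KVM_POLL_CONTROL above gives the guest a single enable bit; clearing it makes kvm_arch_no_poll() (added near the end of x86.c below) return true, so the generic halt-polling code stops polling on behalf of that vCPU. A hypothetical guest-side sketch:

/* opt this vCPU out of host-side halt polling; the default after vCPU
 * creation is 1, i.e. polling enabled */
wrmsrl(MSR_KVM_POLL_CONTROL, 0);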
static struct perf_guest_info_callbacks kvm_guest_cbs = { .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ - - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask); -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -6944,12 +6969,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -7032,8 +7057,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_set_mmio_spte_mask(); - kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, @@ -7172,6 +7195,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } +static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +{ + struct kvm_vcpu *target = NULL; + struct kvm_apic_map *map; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) + target = map->phys_map[dest_id]->vcpu; + + rcu_read_unlock(); + + if (target) + kvm_vcpu_yield_to(target); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -7218,6 +7258,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_SEND_IPI: ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; + case KVM_HC_SCHED_YIELD: + kvm_sched_yield(vcpu->kvm, a0); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -7950,9 +7994,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - wait_lapic_expire(vcpu); guest_enter_irqoff(); fpregs_assert_state_consistent(); @@ -8001,13 +8042,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_before_interrupt(vcpu); - kvm_x86_ops->handle_external_intr(vcpu); - kvm_after_interrupt(vcpu); + kvm_x86_ops->handle_exit_irqoff(vcpu); + /* + * Consume any pending interrupts, including the possible source of + * VM-Exit on SVM and any ticks that occur between VM-Exit and now. + * An instruction is required after local_irq_enable() to fully unblock + * interrupts on processors that implement an interrupt shadow, the + * stat.exits increment will do nicely. 
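kvm_sched_yield() above implements the new KVM_HC_SCHED_YIELD hypercall: the calling vCPU donates its timeslice to the vCPU whose physical APIC ID is passed in the first argument. A hypothetical guest-side use, where kvm_hypercall1() is the guest's paravirt wrapper from asm/kvm_para.h:

/* e.g. boost the vCPU currently holding a contended lock */
static void kvm_yield_to_cpu(int phys_apicid)
{
        kvm_hypercall1(KVM_HC_SCHED_YIELD, phys_apicid);
}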
+ */ + kvm_before_interrupt(vcpu); + local_irq_enable(); ++vcpu->stat.exits; + local_irq_disable(); + kvm_after_interrupt(vcpu); guest_exit_irqoff(); + if (lapic_in_kernel(vcpu)) { + s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; + if (delta != S64_MIN) { + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); + vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; + } + } local_irq_enable(); preempt_enable(); @@ -8593,7 +8650,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { + if (is_pae_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } @@ -8874,6 +8931,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + + /* poll control enabled by default */ + vcpu->arch.msr_kvm_poll_control = 1; + mutex_unlock(&vcpu->mutex); if (!kvmclock_periodic_sync) @@ -9015,7 +9076,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot + * elapsed; our helper function, ktime_get_boottime_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -9106,9 +9167,9 @@ void kvm_arch_hardware_unsetup(void) kvm_x86_ops->hardware_unsetup(); } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - kvm_x86_ops->check_processor_compatibility(rtn); + return kvm_x86_ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) @@ -9243,7 +9304,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); + kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); pvclock_update_vm_gtod_copy(kvm); kvm->arch.guest_can_read_msr_platform_info = true; @@ -9380,6 +9441,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_hv_destroy_vm(kvm); @@ -9788,6 +9850,36 @@ static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) sizeof(u32)); } +static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) + return false; + + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || + (vcpu->arch.apf.send_user_only && + kvm_x86_ops->get_cpl(vcpu) == 0)) + return false; + + return true; +} + +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!lapic_in_kernel(vcpu) || + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) + return false; + + if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) + return false; + + /* + * If interrupts are off we cannot even use an artificial + * halt state. 
+ */ + return kvm_x86_ops->interrupt_allowed(vcpu); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -9796,11 +9888,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, trace_kvm_async_pf_not_present(work->arch.token, work->gva); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + if (kvm_can_deliver_async_pf(vcpu) && + !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -9808,6 +9897,16 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.address = work->arch.token; fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); + } else { + /* + * It is not possible to deliver a paravirtualized asynchronous + * page fault, but putting the guest in an artificial halt state + * can be beneficial nevertheless: if an interrupt arrives, we + * can deliver it timely and perhaps the guest will schedule + * another process. When the instruction that triggered a page + * fault is retried, hopefully the page will be ready in the host. + */ + kvm_make_request(KVM_REQ_APF_HALT, vcpu); } } @@ -9948,6 +10047,13 @@ bool kvm_vector_hashing_enabled(void) } EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; +} +EXPORT_SYMBOL_GPL(kvm_arch_no_poll); + + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a470ff0868c5..e08a12892e8b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -139,6 +139,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } +static inline bool is_pae_paging(struct kvm_vcpu *vcpu) +{ + return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); +} + static inline u32 bit(int bitno) { return 1 << (bitno & 31); @@ -333,6 +338,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm) return kvm->arch.pause_in_guest; } +static inline bool kvm_cstate_in_guest(struct kvm *kvm) +{ + return kvm->arch.cstate_in_guest; +} + DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 1811fa4a1b1a..7c48ff4ae8d1 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -15,6 +15,7 @@ EXPORT_SYMBOL(wbinvd_on_cpu); int wbinvd_on_all_cpus(void) { - return on_each_cpu(__wbinvd, NULL, 1); + on_each_cpu(__wbinvd, NULL, 1); + return 0; } EXPORT_SYMBOL(wbinvd_on_all_cpus); diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index c6f4982d5401..39001a401eff 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -26,8 +26,6 @@ static int ptdump_curknl_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); #ifdef CONFIG_PAGE_TABLE_ISOLATION -static struct dentry *pe_curusr; - static int ptdump_curusr_show(struct seq_file *m, void *v) { if (current->mm->pgd) { @@ -42,8 +40,6 @@ DEFINE_SHOW_ATTRIBUTE(ptdump_curusr); #endif #if defined(CONFIG_EFI) && defined(CONFIG_X86_64) -static struct dentry *pe_efi; - static int 
ptdump_efi_show(struct seq_file *m, void *v) { if (efi_mm.pgd) @@ -54,41 +50,24 @@ static int ptdump_efi_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump_efi); #endif -static struct dentry *dir, *pe_knl, *pe_curknl; +static struct dentry *dir; static int __init pt_dump_debug_init(void) { dir = debugfs_create_dir("page_tables", NULL); - if (!dir) - return -ENOMEM; - - pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, - &ptdump_fops); - if (!pe_knl) - goto err; - pe_curknl = debugfs_create_file("current_kernel", 0400, - dir, NULL, &ptdump_curknl_fops); - if (!pe_curknl) - goto err; + debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); + debugfs_create_file("current_kernel", 0400, dir, NULL, + &ptdump_curknl_fops); #ifdef CONFIG_PAGE_TABLE_ISOLATION - pe_curusr = debugfs_create_file("current_user", 0400, - dir, NULL, &ptdump_curusr_fops); - if (!pe_curusr) - goto err; + debugfs_create_file("current_user", 0400, dir, NULL, + &ptdump_curusr_fops); #endif - #if defined(CONFIG_EFI) && defined(CONFIG_X86_64) - pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); - if (!pe_efi) - goto err; + debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); #endif - return 0; -err: - debugfs_remove_recursive(dir); - return -ENOMEM; } static void __exit pt_dump_debug_exit(void) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 46df4c6aae46..794f364cb882 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address, * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. + * + * NB: This means that failed vsyscalls with vsyscall=none + * will have the PROT bit. This doesn't leak any + * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) error_code |= X86_PF_PROT; @@ -756,8 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address, - tsk); + force_sig_fault(signal, si_code, (void __user *)address); } /* @@ -918,7 +921,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); - force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } @@ -1015,8 +1018,6 @@ static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { - struct task_struct *tsk = current; - /* Kernel mode? 
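The pt_dump_debug_init() rewrite above follows the tree-wide debugfs convention: creation failures are handled and reported by the debugfs core, and code logic should never branch on the result. The resulting idiom, sketched:

static struct dentry *dir;

static int __init example_init(void)
{
        /* create and forget: no return-value checks, no unwind paths */
        dir = debugfs_create_dir("page_tables", NULL);
        debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops);
        return 0;
}

static void __exit example_exit(void)
{
        debugfs_remove_recursive(dir);  /* single recursive teardown */
}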
Handle exceptions or die: */ if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); @@ -1031,6 +1032,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + struct task_struct *tsk = current; unsigned lsb = 0; pr_err( @@ -1040,11 +1042,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; - force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk); + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static noinline void @@ -1369,16 +1371,18 @@ void do_user_addr_fault(struct pt_regs *regs, #ifdef CONFIG_X86_64 /* - * Instruction fetch faults in the vsyscall page might need - * emulation. The vsyscall page is at a high address - * (>PAGE_OFFSET), but is considered to be part of the user - * address space. + * Faults in the vsyscall page might need emulation. The + * vsyscall page is at a high address (>PAGE_OFFSET), but is + * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. + * + * PKRU never rejects instruction fetches, so we don't need + * to consider the PF_PK bit. */ - if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(regs, address)) + if (is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(hw_error_code, regs, address)) return; } #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4b6423e7bd21..e500f1df1140 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -28,9 +28,11 @@ #include "physaddr.h" -struct ioremap_mem_flags { - bool system_ram; - bool desc_other; +/* + * Descriptor controlling ioremap() behavior. + */ +struct ioremap_desc { + unsigned int flags; }; /* @@ -62,13 +64,14 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static bool __ioremap_check_ram(struct resource *res) +/* Does the range (or a subset of) contain normal RAM? */ +static unsigned int __ioremap_check_ram(struct resource *res) { unsigned long start_pfn, stop_pfn; unsigned long i; if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) - return false; + return 0; start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; stop_pfn = (res->end + 1) >> PAGE_SHIFT; @@ -76,28 +79,44 @@ static bool __ioremap_check_ram(struct resource *res) for (i = 0; i < (stop_pfn - start_pfn); ++i) if (pfn_valid(start_pfn + i) && !PageReserved(pfn_to_page(start_pfn + i))) - return true; + return IORES_MAP_SYSTEM_RAM; } - return false; + return 0; } -static int __ioremap_check_desc_other(struct resource *res) +/* + * In a SEV guest, NONE and RESERVED should not be mapped encrypted because + * there the whole memory is already encrypted. 
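The vsyscall hunk above widens emulation from instruction fetches to any fault on the vsyscall page, passing hw_error_code so emulate_vsyscall() can decide what to do with it (as the new comment notes, PKRU never rejects instruction fetches, so the PF_PK bit needs no special casing here). For reference, the page test itself is a single fixed-address compare; a sketch using the kernel's fixed vsyscall address:

#define VSYSCALL_ADDR (-10UL << 20)     /* 0xffffffffff600000 */

static bool is_vsyscall_vaddr(unsigned long vaddr)
{
        return (vaddr & PAGE_MASK) == VSYSCALL_ADDR;
}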
+ */ +static unsigned int __ioremap_check_encrypted(struct resource *res) { - return (res->desc != IORES_DESC_NONE); + if (!sev_active()) + return 0; + + switch (res->desc) { + case IORES_DESC_NONE: + case IORES_DESC_RESERVED: + break; + default: + return IORES_MAP_ENCRYPTED; + } + + return 0; } -static int __ioremap_res_check(struct resource *res, void *arg) +static int __ioremap_collect_map_flags(struct resource *res, void *arg) { - struct ioremap_mem_flags *flags = arg; + struct ioremap_desc *desc = arg; - if (!flags->system_ram) - flags->system_ram = __ioremap_check_ram(res); + if (!(desc->flags & IORES_MAP_SYSTEM_RAM)) + desc->flags |= __ioremap_check_ram(res); - if (!flags->desc_other) - flags->desc_other = __ioremap_check_desc_other(res); + if (!(desc->flags & IORES_MAP_ENCRYPTED)) + desc->flags |= __ioremap_check_encrypted(res); - return flags->system_ram && flags->desc_other; + return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) == + (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)); } /* @@ -106,15 +125,15 @@ static int __ioremap_res_check(struct resource *res, void *arg) * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). */ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, - struct ioremap_mem_flags *flags) + struct ioremap_desc *desc) { u64 start, end; start = (u64)addr; end = start + size - 1; - memset(flags, 0, sizeof(*flags)); + memset(desc, 0, sizeof(struct ioremap_desc)); - walk_mem_res(start, end, flags, __ioremap_res_check); + walk_mem_res(start, end, desc, __ioremap_collect_map_flags); } /* @@ -131,15 +150,15 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, enum page_cache_mode pcm, - void *caller, bool encrypted) +static void __iomem * +__ioremap_caller(resource_size_t phys_addr, unsigned long size, + enum page_cache_mode pcm, void *caller, bool encrypted) { unsigned long offset, vaddr; resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; - struct ioremap_mem_flags mem_flags; + struct ioremap_desc io_desc; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -158,12 +177,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - __ioremap_check_mem(phys_addr, size, &mem_flags); + __ioremap_check_mem(phys_addr, size, &io_desc); /* * Don't allow anybody to remap normal RAM that we're using.. */ - if (mem_flags.system_ram) { + if (io_desc.flags & IORES_MAP_SYSTEM_RAM) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -201,7 +220,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * resulting mapping. */ prot = PAGE_KERNEL_IO; - if ((sev_active() && mem_flags.desc_other) || encrypted) + if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) prot = pgprot_encrypted(prot); switch (pcm) { diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index dddcd2a1afdb..e2b0e2ac07bb 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -70,6 +70,19 @@ struct sme_populate_pgd_data { unsigned long vaddr_end; }; +/* + * This work area lives in the .init.scratch section, which lives outside of + * the kernel proper. 
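__ioremap_collect_map_flags() above runs as a walk_mem_res() callback, and its return value doubles as the stop condition: the walk ends early once both flags are known. A condensed view of how the collected flags then steer __ioremap_caller(), restating the hunks above:

/*
 * IORES_MAP_SYSTEM_RAM -> refuse the mapping; normal RAM is never
 *                         remapped
 * IORES_MAP_ENCRYPTED  -> SEV guest, resource is not NONE/RESERVED:
 *                         keep the mapping encrypted
 */
if (io_desc.flags & IORES_MAP_SYSTEM_RAM)
        return NULL;

prot = PAGE_KERNEL_IO;
if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted)
        prot = pgprot_encrypted(prot);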
It is sized to hold the intermediate copy buffer and + * more than enough pagetable pages. + * + * By using this section, the kernel can be encrypted in place and it + * avoids any possibility of boot parameters or initramfs images being + * placed such that the in-place encryption logic overwrites them. This + * section is 2MB aligned to allow for simple pagetable setup using only + * PMD entries (see vmlinux.lds.S). + */ +static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.init.scratch); + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off"; @@ -311,8 +324,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp) } #endif - /* Set the encryption workarea to be immediately after the kernel */ - workarea_start = kernel_end; + /* + * We're running identity mapped, so we must obtain the address to the + * SME encryption workarea using rip-relative addressing. + */ + asm ("lea sme_workarea(%%rip), %0" + : "=r" (workarea_start) + : "p" (sme_workarea)); /* * Calculate required number of workarea bytes needed: diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 0d1c47cbbdd6..895fb7a9294d 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -912,7 +912,7 @@ void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, ret = mpx_unmap_tables(mm, start, end); if (ret) - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } /* MPX cannot handle addresses above 47 bits yet. */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1f67b1e15bf6..44816ff6411f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -13,33 +13,17 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) - #ifdef CONFIG_HIGHPTE -#define PGALLOC_USER_GFP __GFP_HIGHMEM +#define PGTABLE_HIGHMEM __GFP_HIGHMEM #else -#define PGALLOC_USER_GFP 0 +#define PGTABLE_HIGHMEM 0 #endif -gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); -} +gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *pte; - - pte = alloc_pages(__userpte_alloc_gfp, 0); - if (!pte) - return NULL; - if (!pgtable_page_ctor(pte)) { - __free_page(pte); - return NULL; - } - return pte; + return __pte_alloc_one(mm, __userpte_alloc_gfp); } static int __init setup_userpte(char *arg) @@ -235,7 +219,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; - gfp_t gfp = PGALLOC_GFP; + gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; @@ -399,14 +383,14 @@ static inline pgd_t *_pgd_alloc(void) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_pages(PGALLOC_GFP, + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate * a 32-byte slab for pgd to save memory space. 
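The inline asm above matters because sme_encrypt_kernel() executes identity-mapped at the kernel's physical load address, while a plain C reference to sme_workarea may be materialized as the link-time virtual address, which is not mapped yet. A rip-relative lea always resolves relative to the code that is actually running. Sketch of the contrast (the first form's behavior is compiler- and code-model-dependent, which is exactly why it is avoided):

unsigned long workarea_start;

/* may become an absolute reference to the unmapped kernel VA: */
/* workarea_start = (unsigned long)sme_workarea; */

/* resolves relative to the currently executing (identity-mapped) code: */
asm ("lea sme_workarea(%%rip), %0"
     : "=r" (workarea_start)
     : "p" (sme_workarea));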
*/ - return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); + return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } static inline void _pgd_free(pgd_t *pgd) @@ -424,7 +408,8 @@ void __init pgd_cache_init(void) static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, + PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 91f6db92554c..4de9704c4aaf 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -712,7 +712,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, } /* - * See Documentation/x86/tlb.txt for details. We choose 33 + * See Documentation/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at * least 95%) of allocations, and is small enough that we are * confident it will not cause too much overhead. Each single diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index b29e82f190c7..393d251798c0 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -253,13 +253,14 @@ static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk, /* dst = src */ static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[], const u8 src[], bool dstk, - bool sstk, u8 **pprog) + bool sstk, u8 **pprog, + const struct bpf_prog_aux *aux) { emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog); if (is64) /* complete 8 byte move */ emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog); - else + else if (!aux->verifier_zext) /* zero out high 4 bytes */ emit_ia32_mov_i(dst_hi, 0, dstk, pprog); } @@ -313,7 +314,8 @@ static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk, } static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val, - bool dstk, u8 **pprog) + bool dstk, u8 **pprog, + const struct bpf_prog_aux *aux) { u8 *prog = *pprog; int cnt = 0; @@ -334,12 +336,14 @@ static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val, */ EMIT2(0x0F, 0xB7); EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + if (!aux->verifier_zext) + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); break; case 32: - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + if (!aux->verifier_zext) + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); break; case 64: /* nop */ @@ -358,7 +362,8 @@ static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val, } static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val, - bool dstk, u8 **pprog) + bool dstk, u8 **pprog, + const struct bpf_prog_aux *aux) { u8 *prog = *pprog; int cnt = 0; @@ -380,16 +385,18 @@ static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val, EMIT2(0x0F, 0xB7); EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + if (!aux->verifier_zext) + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); break; case 32: /* Emit 'bswap eax' to swap lower 4 bytes */ EMIT1(0x0F); EMIT1(add_1reg(0xC8, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + if (!aux->verifier_zext) + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); break; case 64: /* Emit 'bswap eax' to swap lower 4 bytes */ @@ -569,7 +576,7 @@ static inline void emit_ia32_alu_r(const bool is64, const bool 
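pte_alloc_one() above now delegates to the generic __pte_alloc_one() that this series adds to asm-generic/pgalloc.h. A sketch of that helper, on the assumption that it mirrors the open-coded allocation it replaces:

pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
{
        struct page *pte;

        pte = alloc_page(gfp);
        if (!pte)
                return NULL;
        if (!pgtable_page_ctor(pte)) {
                __free_page(pte);
                return NULL;
        }
        return pte;
}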
hi, const u8 op, static inline void emit_ia32_alu_r64(const bool is64, const u8 op, const u8 dst[], const u8 src[], bool dstk, bool sstk, - u8 **pprog) + u8 **pprog, const struct bpf_prog_aux *aux) { u8 *prog = *pprog; @@ -577,7 +584,7 @@ static inline void emit_ia32_alu_r64(const bool is64, const u8 op, if (is64) emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk, &prog); - else + else if (!aux->verifier_zext) emit_ia32_mov_i(dst_hi, 0, dstk, &prog); *pprog = prog; } @@ -668,7 +675,8 @@ static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op, /* ALU operation (64 bit) */ static inline void emit_ia32_alu_i64(const bool is64, const u8 op, const u8 dst[], const u32 val, - bool dstk, u8 **pprog) + bool dstk, u8 **pprog, + const struct bpf_prog_aux *aux) { u8 *prog = *pprog; u32 hi = 0; @@ -679,7 +687,7 @@ static inline void emit_ia32_alu_i64(const bool is64, const u8 op, emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog); if (is64) emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog); - else + else if (!aux->verifier_zext) emit_ia32_mov_i(dst_hi, 0, dstk, &prog); *pprog = prog; @@ -724,9 +732,6 @@ static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[], { u8 *prog = *pprog; int cnt = 0; - static int jmp_label1 = -1; - static int jmp_label2 = -1; - static int jmp_label3 = -1; u8 dreg_lo = dstk ? IA32_EAX : dst_lo; u8 dreg_hi = dstk ? IA32_EDX : dst_hi; @@ -745,78 +750,22 @@ static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[], /* mov ecx,src_lo */ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); - /* cmp ecx,32 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); - /* Jumps when >= 32 */ - if (is_imm8(jmp_label(jmp_label1, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); - - /* < 32 */ - /* shl dreg_hi,cl */ - EMIT2(0xD3, add_1reg(0xE0, dreg_hi)); - /* mov ebx,dreg_lo */ - EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* shld dreg_hi,dreg_lo,cl */ + EMIT3(0x0F, 0xA5, add_2reg(0xC0, dreg_hi, dreg_lo)); /* shl dreg_lo,cl */ EMIT2(0xD3, add_1reg(0xE0, dreg_lo)); - /* IA32_ECX = -IA32_ECX + 32 */ - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); + /* if ecx >= 32, mov dreg_lo into dreg_hi and clear dreg_lo */ - /* shr ebx,cl */ - EMIT2(0xD3, add_1reg(0xE8, IA32_EBX)); - /* or dreg_hi,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX)); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 32 */ - if (jmp_label1 == -1) - jmp_label1 = cnt; - - /* cmp ecx,64 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); - /* Jumps when >= 64 */ - if (is_imm8(jmp_label(jmp_label2, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* skip the next two instructions (4 bytes) when < 32 */ + EMIT2(IA32_JB, 4); - /* >= 32 && < 64 */ - /* sub ecx,32 */ - EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); - /* shl dreg_lo,cl */ - EMIT2(0xD3, add_1reg(0xE0, dreg_lo)); /* mov dreg_hi,dreg_lo */ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo)); - - /* xor dreg_lo,dreg_lo */ - EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* 
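The BPF_LSH rewrite above leans on SHLD plus the fact that 32-bit x86 masks shift counts to 5 bits, so counts of 32..63 need only the short cmp/jb/mov/xor fixup emitted after it. A C model of what the emitted sequence computes for a count of 0..63:

static void lsh64(u32 *lo, u32 *hi, u32 cnt)
{
        u32 c = cnt & 31;               /* hardware masks cl to 5 bits   */

        *hi = (*hi << c) | (c ? *lo >> (32 - c) : 0);   /* shld hi,lo,cl */
        *lo <<= c;                                      /* shl  lo,cl    */

        if (cnt >= 32) {                /* cmp ecx,32 ; jb over fixup    */
                *hi = *lo;              /* mov hi,lo                     */
                *lo = 0;                /* xor lo,lo                     */
        }
}

E.g. cnt == 32 leaves lo and hi untouched by the shifts (c == 0) and the fixup produces hi = lo, lo = 0, exactly a 64-bit shift by 32.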
>= 64 */ - if (jmp_label2 == -1) - jmp_label2 = cnt; /* xor dreg_lo,dreg_lo */ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); - - if (jmp_label3 == -1) - jmp_label3 = cnt; if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ @@ -836,9 +785,6 @@ static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[], { u8 *prog = *pprog; int cnt = 0; - static int jmp_label1 = -1; - static int jmp_label2 = -1; - static int jmp_label3 = -1; u8 dreg_lo = dstk ? IA32_EAX : dst_lo; u8 dreg_hi = dstk ? IA32_EDX : dst_hi; @@ -857,79 +803,23 @@ static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[], /* mov ecx,src_lo */ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); - /* cmp ecx,32 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); - /* Jumps when >= 32 */ - if (is_imm8(jmp_label(jmp_label1, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); - - /* < 32 */ - /* lshr dreg_lo,cl */ - EMIT2(0xD3, add_1reg(0xE8, dreg_lo)); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); - /* ashr dreg_hi,cl */ + /* shrd dreg_lo,dreg_hi,cl */ + EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi)); + /* sar dreg_hi,cl */ EMIT2(0xD3, add_1reg(0xF8, dreg_hi)); - /* IA32_ECX = -IA32_ECX + 32 */ - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 32 */ - if (jmp_label1 == -1) - jmp_label1 = cnt; + /* if ecx >= 32, mov dreg_hi to dreg_lo and set/clear dreg_hi depending on sign */ - /* cmp ecx,64 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); - /* Jumps when >= 64 */ - if (is_imm8(jmp_label(jmp_label2, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* skip the next two instructions (5 bytes) when < 32 */ + EMIT2(IA32_JB, 5); - /* >= 32 && < 64 */ - /* sub ecx,32 */ - EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); - /* ashr dreg_hi,cl */ - EMIT2(0xD3, add_1reg(0xF8, dreg_hi)); /* mov dreg_lo,dreg_hi */ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); - - /* ashr dreg_hi,imm8 */ + /* sar dreg_hi,31 */ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31); - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 64 */ - if (jmp_label2 == -1) - jmp_label2 = cnt; - /* ashr dreg_hi,imm8 */ - EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31); - /* mov dreg_lo,dreg_hi */ - EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); - - if (jmp_label3 == -1) - jmp_label3 = cnt; - if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), @@ -948,9 +838,6 @@ static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk, { u8 *prog = *pprog; int cnt = 0; - static int jmp_label1 = -1; - static int jmp_label2 = -1; - static int jmp_label3 = -1; u8 dreg_lo = dstk ? IA32_EAX : dst_lo; u8 dreg_hi = dstk ? 
IA32_EDX : dst_hi; @@ -969,77 +856,23 @@ static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk, /* mov ecx,src_lo */ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); - /* cmp ecx,32 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); - /* Jumps when >= 32 */ - if (is_imm8(jmp_label(jmp_label1, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); - - /* < 32 */ - /* lshr dreg_lo,cl */ - EMIT2(0xD3, add_1reg(0xE8, dreg_lo)); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shrd dreg_lo,dreg_hi,cl */ + EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi)); /* shr dreg_hi,cl */ EMIT2(0xD3, add_1reg(0xE8, dreg_hi)); - /* IA32_ECX = -IA32_ECX + 32 */ - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); - - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + /* if ecx >= 32, mov dreg_hi to dreg_lo and clear dreg_hi */ - /* >= 32 */ - if (jmp_label1 == -1) - jmp_label1 = cnt; - /* cmp ecx,64 */ - EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); - /* Jumps when >= 64 */ - if (is_imm8(jmp_label(jmp_label2, 2))) - EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); - else - EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* skip the next two instructions (4 bytes) when < 32 */ + EMIT2(IA32_JB, 4); - /* >= 32 && < 64 */ - /* sub ecx,32 */ - EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); - /* shr dreg_hi,cl */ - EMIT2(0xD3, add_1reg(0xE8, dreg_hi)); /* mov dreg_lo,dreg_hi */ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); /* xor dreg_hi,dreg_hi */ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); - /* goto out; */ - if (is_imm8(jmp_label(jmp_label3, 2))) - EMIT2(0xEB, jmp_label(jmp_label3, 2)); - else - EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); - - /* >= 64 */ - if (jmp_label2 == -1) - jmp_label2 = cnt; - /* xor dreg_lo,dreg_lo */ - EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); - /* xor dreg_hi,dreg_hi */ - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); - - if (jmp_label3 == -1) - jmp_label3 = cnt; - if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), @@ -1069,27 +902,10 @@ static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val, } /* Do LSH operation */ if (val < 32) { - /* shl dreg_hi,imm8 */ - EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val); - /* mov ebx,dreg_lo */ - EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* shld dreg_hi,dreg_lo,imm8 */ + EMIT4(0x0F, 0xA4, add_2reg(0xC0, dreg_hi, dreg_lo), val); /* shl dreg_lo,imm8 */ EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val); - - /* IA32_ECX = 32 - val */ - /* mov ecx,val */ - EMIT2(0xB1, val); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shr ebx,cl */ - EMIT2(0xD3, add_1reg(0xE8, IA32_EBX)); - /* or dreg_hi,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX)); } else if (val >= 32 && val < 64) { u32 value = val - 32; @@ -1135,27 +951,10 @@ static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val, /* Do RSH operation */ if (val < 32) { - /* shr dreg_lo,imm8 */ - 
EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shrd dreg_lo,dreg_hi,imm8 */ + EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val); /* shr dreg_hi,imm8 */ EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val); - - /* IA32_ECX = 32 - val */ - /* mov ecx,val */ - EMIT2(0xB1, val); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); } else if (val >= 32 && val < 64) { u32 value = val - 32; @@ -1200,27 +999,10 @@ static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val, } /* Do RSH operation */ if (val < 32) { - /* shr dreg_lo,imm8 */ - EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val); - /* mov ebx,dreg_hi */ - EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shrd dreg_lo,dreg_hi,imm8 */ + EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val); /* ashr dreg_hi,imm8 */ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val); - - /* IA32_ECX = 32 - val */ - /* mov ecx,val */ - EMIT2(0xB1, val); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* neg ecx */ - EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); - /* add ecx,32 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); - - /* shl ebx,cl */ - EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); - /* or dreg_lo,ebx */ - EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); } else if (val >= 32 && val < 64) { u32 value = val - 32; @@ -1713,8 +1495,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_MOV | BPF_X: switch (BPF_SRC(code)) { case BPF_X: - emit_ia32_mov_r64(is64, dst, src, dstk, - sstk, &prog); + if (imm32 == 1) { + /* Special mov32 for zext. 
*/ + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + break; + } + emit_ia32_mov_r64(is64, dst, src, dstk, sstk, + &prog, bpf_prog->aux); break; case BPF_K: /* Sign-extend immediate value to dst reg */ @@ -1754,11 +1541,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, switch (BPF_SRC(code)) { case BPF_X: emit_ia32_alu_r64(is64, BPF_OP(code), dst, - src, dstk, sstk, &prog); + src, dstk, sstk, &prog, + bpf_prog->aux); break; case BPF_K: emit_ia32_alu_i64(is64, BPF_OP(code), dst, - imm32, dstk, &prog); + imm32, dstk, &prog, + bpf_prog->aux); break; } break; @@ -1777,7 +1566,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, false, &prog); break; } - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + if (!bpf_prog->aux->verifier_zext) + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); break; case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_X: @@ -1797,7 +1587,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, &prog); break; } - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + if (!bpf_prog->aux->verifier_zext) + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); break; /* dst = dst / src(imm) */ /* dst = dst % src(imm) */ @@ -1819,7 +1610,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, &prog); break; } - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + if (!bpf_prog->aux->verifier_zext) + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); break; case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_X: @@ -1836,7 +1628,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk, false, &prog); - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + if (!bpf_prog->aux->verifier_zext) + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); break; /* dst = dst << imm */ case BPF_ALU64 | BPF_LSH | BPF_K: @@ -1872,7 +1665,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_NEG: emit_ia32_alu_i(is64, false, BPF_OP(code), dst_lo, 0, dstk, &prog); - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + if (!bpf_prog->aux->verifier_zext) + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); break; /* dst = ~dst (64 bit) */ case BPF_ALU64 | BPF_NEG: @@ -1892,11 +1686,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; /* dst = htole(dst) */ case BPF_ALU | BPF_END | BPF_FROM_LE: - emit_ia32_to_le_r64(dst, imm32, dstk, &prog); + emit_ia32_to_le_r64(dst, imm32, dstk, &prog, + bpf_prog->aux); break; /* dst = htobe(dst) */ case BPF_ALU | BPF_END | BPF_FROM_BE: - emit_ia32_to_be_r64(dst, imm32, dstk, &prog); + emit_ia32_to_be_r64(dst, imm32, dstk, &prog, + bpf_prog->aux); break; /* dst = imm64 */ case BPF_LD | BPF_IMM | BPF_DW: { @@ -2051,6 +1847,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_B: case BPF_H: case BPF_W: + if (!bpf_prog->aux->verifier_zext) + break; if (dstk) { EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi)); @@ -2475,6 +2273,11 @@ notyet: return proglen; } +bool bpf_jit_needs_zext(void) +{ + return true; +} + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 17185d73d649..ee6b0780bea1 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -104,24 +104,12 @@ DEFINE_SHOW_ATTRIBUTE(punit_dev_state); static struct dentry *punit_dbg_file; -static 
int punit_dbgfs_register(struct punit_device *punit_device) +static void punit_dbgfs_register(struct punit_device *punit_device) { - struct dentry *dev_state; - punit_dbg_file = debugfs_create_dir("punit_atom", NULL); - if (!punit_dbg_file) - return -ENXIO; - - dev_state = debugfs_create_file("dev_power_state", 0444, - punit_dbg_file, punit_device, - &punit_dev_state_fops); - if (!dev_state) { - pr_err("punit_dev_state register failed\n"); - debugfs_remove(punit_dbg_file); - return -ENXIO; - } - return 0; + debugfs_create_file("dev_power_state", 0444, punit_dbg_file, + punit_device, &punit_dev_state_fops); } static void punit_dbgfs_unregister(void) @@ -145,15 +133,12 @@ MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); static int __init punit_atom_debug_init(void) { const struct x86_cpu_id *id; - int ret; id = x86_match_cpu(intel_punit_cpu_ids); if (!id) return -ENODEV; - ret = punit_dbgfs_register((struct punit_device *)id->driver_data); - if (ret < 0) - return ret; + punit_dbgfs_register((struct punit_device *)id->driver_data); return 0; } diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 8d4daca81eda..c33f744b5388 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -20,7 +20,6 @@ #include <linux/moduleparam.h> #include <linux/leds.h> #include <linux/platform_device.h> -#include <linux/gpio.h> #include <linux/input.h> #include <linux/gpio_keys.h> #include <linux/dmi.h> diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index 136974ec9a90..73a3f49b4eb6 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -18,7 +18,6 @@ #include <linux/string.h> #include <linux/leds.h> #include <linux/platform_device.h> -#include <linux/gpio.h> #include <linux/input.h> #include <linux/gpio_keys.h> #include <linux/dmi.h> diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 2c24d8d30436..163e1b545517 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -18,7 +18,6 @@ #include <linux/string.h> #include <linux/leds.h> #include <linux/platform_device.h> -#include <linux/gpio.h> #include <linux/input.h> #include <linux/gpio_keys.h> diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index b5420371d32d..6dd25dc5f027 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -35,7 +35,6 @@ #include <linux/types.h> struct imr_device { - struct dentry *file; bool init; struct mutex lock; int max_imr; @@ -231,13 +230,11 @@ DEFINE_SHOW_ATTRIBUTE(imr_dbgfs_state); * imr_debugfs_register - register debugfs hooks. * * @idev: pointer to imr_device structure. - * @return: 0 on success - errno on failure. 
*/ -static int imr_debugfs_register(struct imr_device *idev) +static void imr_debugfs_register(struct imr_device *idev) { - idev->file = debugfs_create_file("imr_state", 0444, NULL, idev, - &imr_dbgfs_state_fops); - return PTR_ERR_OR_ZERO(idev->file); + debugfs_create_file("imr_state", 0444, NULL, idev, + &imr_dbgfs_state_fops); } /** @@ -582,7 +579,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { static int __init imr_init(void) { struct imr_device *idev = &imr_dev; - int ret; if (!x86_match_cpu(imr_ids) || !iosf_mbi_available()) return -ENODEV; @@ -592,9 +588,7 @@ static int __init imr_init(void) idev->init = true; mutex_init(&idev->lock); - ret = imr_debugfs_register(idev); - if (ret != 0) - pr_warn("debugfs register failed!\n"); + imr_debugfs_register(idev); imr_fixup_memmap(idev); return 0; } diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index b393eaa798ef..2e796b54cbde 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -461,31 +461,16 @@ static struct dentry *iosf_dbg; static void iosf_sideband_debug_init(void) { - struct dentry *d; - iosf_dbg = debugfs_create_dir("iosf_sb", NULL); - if (IS_ERR_OR_NULL(iosf_dbg)) - return; /* mdr */ - d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr); - if (!d) - goto cleanup; + debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr); /* mcrx */ - d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); - if (!d) - goto cleanup; + debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); /* mcr - initiates mailbox transaction */ - d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); - if (!d) - goto cleanup; - - return; - -cleanup: - debugfs_remove_recursive(d); + debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); } static void iosf_debugfs_init(void) diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 1861a2ba0f2b..c0a502f7e3a7 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -86,7 +86,7 @@ static void __init init_pvh_bootparams(bool xen_guest) } /* - * See Documentation/x86/boot.txt. + * See Documentation/x86/boot.rst. * * Version 2.12 supports Xen entry point but we will use default x86/PC * environment (i.e. hardware_subarch 0).
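The x86-32 BPF JIT hunks above make two independent changes. First, 64-bit shifts by a register are re-emitted around shld/shrd, which move bits between the two half-registers directly; that retires the old branch-heavy sequences together with their function-local static int jmp_label1/2/3 bookkeeping. Second, wherever the JIT used to unconditionally clear the high half of the destination after a 32-bit operation, it now skips that mov when bpf_prog->aux->verifier_zext is set, because the verifier has already inserted an explicit zero-extension (the imm32 == 1 special case under ALU64 MOV is exactly that inserted instruction); the new bpf_jit_needs_zext() returning true is what opts the JIT into this scheme. As a rough C model of what the new LSH-by-register sequence computes — a sketch of mine, not code from the patch, and lsh64() is a made-up name — it relies on the fact that ia32 shl/shld with a %cl count use the count modulo 32:

#include <stdint.h>

/* Reference model of the JITed BPF_ALU64 LSH-by-register sequence. */
static void lsh64(uint32_t *lo, uint32_t *hi, uint32_t cnt)
{
	uint32_t c = cnt & 31;	/* what shld/shl with %cl actually use */

	if (c) {	/* a count of 0 leaves both halves untouched */
		*hi = (*hi << c) | (*lo >> (32 - c));	/* shld dreg_hi,dreg_lo,cl */
		*lo <<= c;				/* shl dreg_lo,cl */
	}
	if (cnt >= 32) {	/* the two instructions skipped by the "jb 4" */
		*hi = *lo;	/* mov dreg_hi,dreg_lo */
		*lo = 0;	/* xor dreg_lo,dreg_lo */
	}
}

The rsh/arsh variants mirror this with shrd plus shr or sar (arsh fills the high half with sar dreg_hi,31 instead of clearing it), and the immediate-shift hunks use the imm8 forms of shld/shrd, which is why the 32-minus-count juggling through ecx and ebx disappears there as well.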
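The platform debugfs hunks — punit_atom, imr and iosf_mbi above, tlb_uv and the Xen files below — are all one and the same conversion: the return values of the debugfs_create_*() calls are no longer checked, since callers are expected to behave identically whether or not debugfs is available, and the registration helpers consequently shrink to void functions. Roughly the resulting shape, as a sketch with made-up names (struct my_device, my_fops, "my_driver") rather than any one of the drivers touched here:

#include <linux/debugfs.h>

struct my_device { int power_state; };		/* hypothetical driver state */
static const struct file_operations my_fops;	/* hypothetical fops */
static struct dentry *my_dir;

static void my_debugfs_register(struct my_device *dev)
{
	my_dir = debugfs_create_dir("my_driver", NULL);

	/* No error checks: the driver must work the same whether or
	 * not this file ever shows up, so the result is ignored. */
	debugfs_create_file("state", 0444, my_dir, dev, &my_fops);
}

static void my_debugfs_unregister(void)
{
	debugfs_remove_recursive(my_dir);
}

Keeping the dentry of the top-level directory is still worthwhile, as the punit_atom and iosf_mbi code does, because it is what debugfs_remove_recursive() needs at teardown; only the per-file dentries become disposable.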
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 0c7dfec4acac..20c389a91b80 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -66,7 +66,6 @@ static struct tunables tunables[] = { }; static struct dentry *tunables_dir; -static struct dentry *tunables_file; /* these correspond to the statistics printed by ptc_seq_show() */ static char *stat_description[] = { @@ -1700,18 +1699,8 @@ static int __init uv_ptc_init(void) } tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); - if (!tunables_dir) { - pr_err("unable to create debugfs directory %s\n", - UV_BAU_TUNABLES_DIR); - return -EINVAL; - } - tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, - tunables_dir, NULL, &tunables_fops); - if (!tunables_file) { - pr_err("unable to create debugfs file %s\n", - UV_BAU_TUNABLES_FILE); - return -EINVAL; - } + debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, tunables_dir, NULL, + &tunables_fops); return 0; } diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index a9c3db125222..9ad6842de4b4 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -11,3 +11,13 @@ config RAS_CEC Bear in mind that this is absolutely useless if your platform doesn't have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. + +config RAS_CEC_DEBUG + bool "CEC debugging machinery" + default n + depends on RAS_CEC + help + Add extra files to (debugfs)/ras/cec to test the correctable error + collector feature. "pfn" is a writable file that allows user to + simulate an error in a particular page frame. "array" is a read-only + file that dumps out the current state of all pages logged so far. diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index e455349e0ab5..34eda63c124b 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -111,7 +111,7 @@ static void parse_args(int argc, char **argv) int main(int argc, char **argv) { char line[BUFSIZE], sym[BUFSIZE] = "<unknown>"; - unsigned char insn_buf[16]; + unsigned char insn_buff[16]; struct insn insn; int insns = 0; int warnings = 0; @@ -130,7 +130,7 @@ int main(int argc, char **argv) } insns++; - memset(insn_buf, 0, 16); + memset(insn_buff, 0, 16); strcpy(copy, line); tab1 = strchr(copy, '\t'); if (!tab1) @@ -143,13 +143,13 @@ int main(int argc, char **argv) *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ while (s < tab2) { if (sscanf(s, "%x", &b) == 1) { - insn_buf[nb++] = (unsigned char) b; + insn_buff[nb++] = (unsigned char) b; s += 3; } else break; } /* Decode an instruction */ - insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); + insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64); insn_get_length(&insn); if (insn.length != nb) { warnings++; diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 14cf07916081..185ceba9d289 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -83,7 +83,7 @@ static void dump_insn(FILE *fp, struct insn *insn) } static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, - unsigned char *insn_buf, struct insn *insn) + unsigned char *insn_buff, struct insn *insn) { int i; @@ -96,7 +96,7 @@ static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, /* Input a decoded instruction sequence directly */ fprintf(fp, " $ echo "); for (i = 0; i < MAX_INSN_SIZE; i++) - fprintf(fp, " %02x", insn_buf[i]); + fprintf(fp, " %02x", insn_buff[i]); fprintf(fp, " | %s -i -\n", prog); if 
(!input_file) { @@ -124,7 +124,7 @@ fail: } /* Read given instruction sequence from the input file */ -static int read_next_insn(unsigned char *insn_buf) +static int read_next_insn(unsigned char *insn_buff) { char buf[256] = "", *tmp; int i; @@ -134,7 +134,7 @@ static int read_next_insn(unsigned char *insn_buf) return 0; for (i = 0; i < MAX_INSN_SIZE; i++) { - insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16); + insn_buff[i] = (unsigned char)strtoul(tmp, &tmp, 16); if (*tmp != ' ') break; } @@ -142,19 +142,19 @@ return i; } -static int generate_insn(unsigned char *insn_buf) +static int generate_insn(unsigned char *insn_buff) { int i; if (input_file) - return read_next_insn(insn_buf); + return read_next_insn(insn_buff); /* Fills buffer with random binary up to MAX_INSN_SIZE */ for (i = 0; i < MAX_INSN_SIZE - 1; i += 2) - *(unsigned short *)(&insn_buf[i]) = random() & 0xffff; + *(unsigned short *)(&insn_buff[i]) = random() & 0xffff; while (i < MAX_INSN_SIZE) - insn_buf[i++] = random() & 0xff; + insn_buff[i++] = random() & 0xff; return i; } @@ -226,31 +226,31 @@ int main(int argc, char **argv) int insns = 0; int errors = 0; unsigned long i; - unsigned char insn_buf[MAX_INSN_SIZE * 2]; + unsigned char insn_buff[MAX_INSN_SIZE * 2]; parse_args(argc, argv); /* Prepare stop bytes with NOPs */ - memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); + memset(insn_buff + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); for (i = 0; i < iter_end; i++) { - if (generate_insn(insn_buf) <= 0) + if (generate_insn(insn_buff) <= 0) break; if (i < iter_start) /* Skip to given iteration number */ continue; /* Decode an instruction */ - insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); + insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64); insn_get_length(&insn); if (insn.next_byte <= insn.kaddr || insn.kaddr + MAX_INSN_SIZE < insn.next_byte) { /* Access out-of-range memory */ - dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn); + dump_stream(stderr, "Error: Found an access violation", i, insn_buff, &insn); errors++; } else if (verbose && !insn_complete(&insn)) - dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn); + dump_stream(stdout, "Info: Found an undecodable input", i, insn_buff, &insn); else if (verbose >= 2) dump_insn(stdout, &insn); insns++; diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 8b4a71efe7ee..7c11c9e5d7ea 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -471,7 +471,7 @@ long sys_sigreturn(void) return PT_REGS_SYSCALL_RET(&current->thread.regs); segfault: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -577,6 +577,6 @@ long sys_rt_sigreturn(void) return PT_REGS_SYSCALL_RET(&current->thread.regs); segfault: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e07abefd3d26..ba5a41828e9d 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN bool "Xen guest support" depends on PARAVIRT select PARAVIRT_CLOCK + select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 13da87918b4f..532410998684 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -9,13 +9,8 @@ static struct dentry *d_xen_debug; struct dentry * __init xen_init_debugfs(void) { - if (!d_xen_debug) { + if (!d_xen_debug) d_xen_debug =
debugfs_create_dir("xen", NULL); - - if (!d_xen_debug) - pr_warning("Could not create 'xen' debugfs directory\n"); - } - return d_xen_debug; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index beb44e22afdf..f6e5eeecfc69 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2700,8 +2700,7 @@ struct remap_data { struct mmu_update *mmu_update; }; -static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) { struct remap_data *rmd = data; pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot)); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95ce9b5be411..0acba2c712ab 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -817,9 +817,6 @@ static int __init xen_p2m_debugfs(void) { struct dentry *d_xen = xen_init_debugfs(); - if (d_xen == NULL) - return -ENOMEM; - d_mmu_debug = debugfs_create_dir("mmu", d_xen); debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 590fcf863006..802ee5bba66c 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -58,6 +58,7 @@ static void cpu_bringup(void) { int cpu; + cr4_init(); cpu_init(); touch_softlockup_watchdog(); preempt_disable(); @@ -251,6 +252,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 6ec1b75eabc5..ebc135bda921 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -2,6 +2,7 @@ config XTENSA def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_NO_COHERENT_DMA_MMAP if !MMU diff --git a/arch/xtensa/include/asm/flat.h b/arch/xtensa/include/asm/flat.h index b8532d7877b3..ed5870c779f9 100644 --- a/arch/xtensa/include/asm/flat.h +++ b/arch/xtensa/include/asm/flat.h @@ -4,11 +4,8 @@ #include <asm/unaligned.h> -#define flat_argvp_envp_on_stack() 0 -#define flat_old_ram_flag(flags) (flags) -#define flat_reloc_valid(reloc, size) ((reloc) <= (size)) static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, - u32 *addr, u32 *persistent) + u32 *addr) { *addr = get_unaligned((__force u32 *)rp); return 0; @@ -18,7 +15,5 @@ static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel) put_unaligned(addr, (__force u32 *)rp); return 0; } -#define flat_get_relocate_addr(rel) (rel) -#define flat_set_persistent(relval, p) 0 #endif /* __ASM_XTENSA_FLAT_H */ diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index 30af4dc3ce7b..b52236245e51 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -3,6 +3,7 @@ #define _XTENSA_UNISTD_H #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 #include <uapi/asm/unistd.h> #define __ARCH_WANT_NEW_STAT diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index a87f8a308cc1..65f05776d827 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -163,10 +163,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, *handle = 
phys_to_dma(dev, page_to_phys(page)); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - return page; - } - #ifdef CONFIG_MMU if (PageHighMem(page)) { void *p; @@ -192,9 +188,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page *page; - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - page = vaddr; - } else if (platform_vaddr_uncached(vaddr)) { + if (platform_vaddr_uncached(vaddr)) { page = virt_to_page(platform_vaddr_to_cached(vaddr)); } else { #ifdef CONFIG_MMU diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index dc22a238ed9c..fbedf2aba09d 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -270,7 +270,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, return ret; badframe: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 5fa0ee1c8e00..25f4de729a6d 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -404,3 +404,5 @@ 431 common fsconfig sys_fsconfig 432 common fsmount sys_fsmount 433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 454d53096bc9..f060348c1b23 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -184,7 +184,7 @@ void do_unhandled(struct pt_regs *regs, unsigned long exccause) "\tEXCCAUSE is %ld\n", current->comm, task_pid_nr(current), regs->pc, exccause); - force_sig(SIGILL, current); + force_sig(SIGILL); } /* @@ -306,7 +306,7 @@ do_illegal_instruction(struct pt_regs *regs) pr_info_ratelimited("Illegal Instruction in '%s' (pid = %d, pc = %#010lx)\n", current->comm, task_pid_nr(current), regs->pc); - force_sig(SIGILL, current); + force_sig(SIGILL); } @@ -330,7 +330,7 @@ do_unaligned_user (struct pt_regs *regs) "(pid = %d, pc = %#010lx)\n", regs->excvaddr, current->comm, task_pid_nr(current), regs->pc); - force_sig_fault(SIGBUS, BUS_ADRALN, (void *) regs->excvaddr, current); + force_sig_fault(SIGBUS, BUS_ADRALN, (void *) regs->excvaddr); } #endif @@ -354,7 +354,7 @@ do_debug(struct pt_regs *regs) /* If in user mode, send SIGTRAP signal to current process */ - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 2ab0e0dcd166..f81b1478da61 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -157,7 +157,7 @@ bad_area: if (user_mode(regs)) { current->thread.bad_vaddr = address; current->thread.error_code = is_write; - force_sig_fault(SIGSEGV, code, (void *) address, current); + force_sig_fault(SIGSEGV, code, (void *) address); return; } bad_page_fault(regs, address, SIGSEGV); @@ -182,7 +182,7 @@ do_sigbus: * or user mode. */ current->thread.bad_vaddr = address; - force_sig_fault(SIGBUS, BUS_ADRERR, (void *) address, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void *) address); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs))
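The force_sig() and force_sig_fault() churn through the um and xtensa signal, trap and fault handlers above is mechanical: both helpers dropped their task argument and now always act on current, so force_sig(SIGSEGV, current) becomes force_sig(SIGSEGV) and the trailing current falls off force_sig_fault(). A minimal sketch of the new calling convention — signal_user_fault() and its arguments are made up, loosely modeled on the xtensa fault path above:

#include <linux/sched/signal.h>

/* Hypothetical tail of a user-mode fault handler. */
static void signal_user_fault(unsigned long address, int si_code)
{
	/* Previously: force_sig_fault(SIGSEGV, si_code, (void *) address, current);
	 * the task parameter is gone and the helper signals current. */
	force_sig_fault(SIGSEGV, si_code, (void *) address);
}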