diff options
Diffstat (limited to 'arch')
169 files changed, 1562 insertions, 1912 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 02cb4e6a3e38..3ab446bd12ef 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -383,7 +383,13 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE config HAVE_RCU_TABLE_FREE bool -config HAVE_RCU_TABLE_INVALIDATE +config HAVE_RCU_TABLE_NO_INVALIDATE + bool + +config HAVE_MMU_GATHER_PAGE_SIZE + bool + +config HAVE_MMU_GATHER_NO_GATHER bool config ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 27c871227eee..f7b19b813a70 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -36,6 +36,7 @@ config ALPHA select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 + select MMU_GATHER_NO_RANGE help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/alpha/include/asm/tlb.h b/arch/alpha/include/asm/tlb.h index 8f5042b61875..4f79e331af5e 100644 --- a/arch/alpha/include/asm/tlb.h +++ b/arch/alpha/include/asm/tlb.h @@ -2,12 +2,6 @@ #ifndef _ALPHA_TLB_H #define _ALPHA_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 63ed39cbd3bd..165f268beafc 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -463,3 +463,7 @@ 532 common getppid sys_getppid # all other architectures have common numbers for new syscall, alpha # is the exception. +534 common pidfd_send_signal sys_pidfd_send_signal +535 common io_uring_setup sys_io_uring_setup +536 common io_uring_enter sys_io_uring_enter +537 common io_uring_register sys_io_uring_register diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 69bc1c9e8e50..7425bb0f2d1b 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -18,8 +18,8 @@ model = "snps,hsdk"; compatible = "snps,hsdk"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; chosen { bootargs = "earlycon=uart8250,mmio32,0xf0005000,115200n8 console=ttyS0,115200n8 debug print-fatal-signals=1"; @@ -105,7 +105,7 @@ #size-cells = <1>; interrupt-parent = <&idu_intc>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; cgu_rst: reset-controller@8a0 { compatible = "snps,hsdk-reset"; @@ -269,9 +269,10 @@ }; memory@80000000 { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; device_type = "memory"; - reg = <0x80000000 0x40000000>; /* 1 GiB */ + reg = <0x0 0x80000000 0x0 0x40000000>; /* 1 GB lowmem */ + /* 0x1 0x00000000 0x0 0x40000000>; 1 GB highmem */ }; }; diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h index a9db5f62aaf3..90cac97643a4 100644 --- a/arch/arc/include/asm/tlb.h +++ b/arch/arc/include/asm/tlb.h @@ -9,38 +9,6 @@ #ifndef _ASM_ARC_TLB_H #define _ASM_ARC_TLB_H -#define tlb_flush(tlb) \ -do { \ - if (tlb->fullmm) \ - flush_tlb_mm((tlb)->mm); \ -} while (0) - -/* - * This pair is called at time of munmap/exit to flush cache and TLB entries - * for mappings being torn down. - * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$ - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range - * - * Note, read http://lkml.org/lkml/2004/1/15/6 - */ -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING -#define tlb_start_vma(tlb, vma) -#else -#define tlb_start_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while(0) -#endif - -#define tlb_end_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, ptep, address) - #include <linux/pagemap.h> #include <asm-generic/tlb.h> diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S index f230bb7092fd..b3373f5c88e0 100644 --- a/arch/arc/lib/memset-archs.S +++ b/arch/arc/lib/memset-archs.S @@ -30,10 +30,10 @@ #else -.macro PREALLOC_INSTR +.macro PREALLOC_INSTR reg, off .endm -.macro PREFETCHW_INSTR +.macro PREFETCHW_INSTR reg, off .endm #endif diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 4135abec3fb0..63e6e6504699 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -113,10 +113,24 @@ static void read_decode_cache_bcr_arcv2(int cpu) } READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c) + if (cbcr.c) { ioc_exists = 1; - else + + /* + * As for today we don't support both IOC and ZONE_HIGHMEM enabled + * simultaneously. This happens because as of today IOC aperture covers + * only ZONE_NORMAL (low mem) and any dma transactions outside this + * region won't be HW coherent. + * If we want to use both IOC and ZONE_HIGHMEM we can use + * bounce_buffer to handle dma transactions to HIGHMEM. + * Also it is possible to modify dma_direct cache ops or increase IOC + * aperture size if we are planning to use HIGHMEM without PAE. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) || is_pae40_enabled()) + ioc_enable = 0; + } else { ioc_enable = 0; + } /* HS 2.0 didn't have AUX_VOL */ if (cpuinfo_arc700[cpu].core.family > 0x51) { @@ -1158,19 +1172,6 @@ noinline void __init arc_ioc_setup(void) if (!ioc_enable) return; - /* - * As for today we don't support both IOC and ZONE_HIGHMEM enabled - * simultaneously. This happens because as of today IOC aperture covers - * only ZONE_NORMAL (low mem) and any dma transactions outside this - * region won't be HW coherent. - * If we want to use both IOC and ZONE_HIGHMEM we can use - * bounce_buffer to handle dma transactions to HIGHMEM. - * Also it is possible to modify dma_direct cache ops or increase IOC - * aperture size if we are planning to use HIGHMEM without PAE. - */ - if (IS_ENABLED(CONFIG_HIGHMEM)) - panic("IOC and HIGHMEM can't be used simultaneously"); - /* Flush + invalidate + disable L1 dcache */ __dc_disable(); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index c8b03e1c6c2a..dc9855c4a3b4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -73,7 +73,7 @@ config ARM select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL - select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL + select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 6d6e0330930b..e388af4594a6 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -47,8 +47,8 @@ config DEBUG_WX choice prompt "Choose kernel unwinder" - default UNWINDER_ARM if AEABI && !FUNCTION_GRAPH_TRACER - default UNWINDER_FRAME_POINTER if !AEABI || FUNCTION_GRAPH_TRACER + default UNWINDER_ARM if AEABI + default UNWINDER_FRAME_POINTER if !AEABI help This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, @@ -65,7 +65,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_ARM bool "ARM EABI stack unwinder" - depends on AEABI + depends on AEABI && !FUNCTION_GRAPH_TRACER select ARM_UNWIND help This option enables stack unwinding support in the kernel diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 6c7ccb428c07..7135820f76d4 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1438,7 +1438,21 @@ ENTRY(efi_stub_entry) @ Preserve return value of efi_entry() in r4 mov r4, r0 - bl cache_clean_flush + + @ our cache maintenance code relies on CP15 barrier instructions + @ but since we arrived here with the MMU and caches configured + @ by UEFI, we must check that the CP15BEN bit is set in SCTLR. + @ Note that this bit is RAO/WI on v6 and earlier, so the ISB in + @ the enable path will be executed on v7+ only. + mrc p15, 0, r1, c1, c0, 0 @ read SCTLR + tst r1, #(1 << 5) @ CP15BEN bit set? + bne 0f + orr r1, r1, #(1 << 5) @ CP15 barrier instructions + mcr p15, 0, r1, c1, c0, 0 @ write SCTLR + ARM( .inst 0xf57ff06f @ v7+ isb ) + THUMB( isb ) + +0: bl cache_clean_flush bl cache_off @ Set parameters for booting zImage according to boot protocol diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f854148c8d7c..bc6d04a09899 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -33,271 +33,42 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> -#define MMU_GATHER_BUNDLE 8 - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE static inline void __tlb_remove_table(void *_table) { free_page_and_swap_cache((struct page *)_table); } -struct mmu_table_batch { - struct rcu_head rcu; - unsigned int nr; - void *tables[0]; -}; - -#define MAX_TABLE_BATCH \ - ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) - -extern void tlb_table_flush(struct mmu_gather *tlb); -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry) -#else -#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry) -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ - -/* - * TLB handling. This allows us to remove pages from the page - * tables, and efficiently handle the TLB issues. - */ -struct mmu_gather { - struct mm_struct *mm; -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - struct mmu_table_batch *batch; - unsigned int need_flush; -#endif - unsigned int fullmm; - struct vm_area_struct *vma; - unsigned long start, end; - unsigned long range_start; - unsigned long range_end; - unsigned int nr; - unsigned int max; - struct page **pages; - struct page *local[MMU_GATHER_BUNDLE]; -}; - -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); - -/* - * This is unnecessarily complex. There's three ways the TLB shootdown - * code is used: - * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region(). - * tlb->fullmm = 0, and tlb_start_vma/tlb_end_vma will be called. - * tlb->vma will be non-NULL. - * 2. Unmapping all vmas. See exit_mmap(). - * tlb->fullmm = 1, and tlb_start_vma/tlb_end_vma will be called. - * tlb->vma will be non-NULL. Additionally, page tables will be freed. - * 3. Unmapping argument pages. See shift_arg_pages(). - * tlb->fullmm = 0, but tlb_start_vma/tlb_end_vma will not be called. - * tlb->vma will be NULL. - */ -static inline void tlb_flush(struct mmu_gather *tlb) -{ - if (tlb->fullmm || !tlb->vma) - flush_tlb_mm(tlb->mm); - else if (tlb->range_end > 0) { - flush_tlb_range(tlb->vma, tlb->range_start, tlb->range_end); - tlb->range_start = TASK_SIZE; - tlb->range_end = 0; - } -} - -static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr) -{ - if (!tlb->fullmm) { - if (addr < tlb->range_start) - tlb->range_start = addr; - if (addr + PAGE_SIZE > tlb->range_end) - tlb->range_end = addr + PAGE_SIZE; - } -} - -static inline void __tlb_alloc_page(struct mmu_gather *tlb) -{ - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - - if (addr) { - tlb->pages = (void *)addr; - tlb->max = PAGE_SIZE / sizeof(struct page *); - } -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - tlb_flush(tlb); -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb_table_flush(tlb); -#endif -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - free_pages_and_swap_cache(tlb->pages, tlb->nr); - tlb->nr = 0; - if (tlb->pages == tlb->local) - __tlb_alloc_page(tlb); -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->fullmm = !(start | (end+1)); - tlb->start = start; - tlb->end = end; - tlb->vma = NULL; - tlb->max = ARRAY_SIZE(tlb->local); - tlb->pages = tlb->local; - tlb->nr = 0; - __tlb_alloc_page(tlb); +#include <asm-generic/tlb.h> -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb->batch = NULL; +#ifndef CONFIG_HAVE_RCU_TABLE_FREE +#define tlb_remove_table(tlb, entry) tlb_remove_page(tlb, entry) #endif -} - -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->range_start = start; - tlb->range_end = end; - } - - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - if (tlb->pages != tlb->local) - free_pages((unsigned long)tlb->pages, 0); -} - -/* - * Memorize the range for the TLB flush. - */ -static inline void -tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) -{ - tlb_add_flush(tlb, addr); -} - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) -/* - * In the case of tlb vma handling, we can optimise these away in the - * case where we're doing a full MM flush. When we're doing a munmap, - * the vmas are adjusted to only cover the region to be torn down. - */ -static inline void -tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) { - flush_cache_range(vma, vma->vm_start, vma->vm_end); - tlb->vma = vma; - tlb->range_start = TASK_SIZE; - tlb->range_end = 0; - } -} static inline void -tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) - tlb_flush(tlb); -} - -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->pages[tlb->nr++] = page; - VM_WARN_ON(tlb->nr > tlb->max); - if (tlb->nr == tlb->max) - return true; - return false; -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, - unsigned long addr) +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { pgtable_page_dtor(pte); -#ifdef CONFIG_ARM_LPAE - tlb_add_flush(tlb, addr); -#else +#ifndef CONFIG_ARM_LPAE /* * With the classic ARM MMU, a pte page has two corresponding pmd * entries, each covering 1MB. */ - addr &= PMD_MASK; - tlb_add_flush(tlb, addr + SZ_1M - PAGE_SIZE); - tlb_add_flush(tlb, addr + SZ_1M); + addr = (addr & PMD_MASK) + SZ_1M; + __tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE); #endif - tlb_remove_entry(tlb, pte); -} - -static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, - unsigned long addr) -{ -#ifdef CONFIG_ARM_LPAE - tlb_add_flush(tlb, addr); - tlb_remove_entry(tlb, virt_to_page(pmdp)); -#endif + tlb_remove_table(tlb, pte); } static inline void -tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) -{ - tlb_add_flush(tlb, addr); -} - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) -#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) - -#define tlb_migrate_finish(mm) do { } while (0) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) +__pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { -} - -static inline void tlb_flush_remove_tables(struct mm_struct *mm) -{ -} +#ifdef CONFIG_ARM_LPAE + struct page *page = virt_to_page(pmdp); -static inline void tlb_flush_remove_tables_local(void *arg) -{ + tlb_remove_table(tlb, page); +#endif } #endif /* CONFIG_MMU */ diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index c08d2d890f7b..b38bbd011b35 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -133,9 +133,9 @@ __secondary_data: */ .text __after_proc_init: -#ifdef CONFIG_ARM_MPU M_CLASS(movw r12, #:lower16:BASEADDR_V7M_SCB) M_CLASS(movt r12, #:upper16:BASEADDR_V7M_SCB) +#ifdef CONFIG_ARM_MPU M_CLASS(ldr r3, [r12, 0x50]) AR_CLASS(mrc p15, 0, r3, c0, c1, 4) @ Read ID_MMFR0 and r3, r3, #(MMFR0_PMSA) @ PMSA field diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 76bb8de6bf6b..be5edfdde558 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -549,8 +549,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) int ret; /* - * Increment event counter and perform fixup for the pre-signal - * frame. + * Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index a56e7c856ab5..86870f40f9a0 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -115,8 +115,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, * running on another CPU? For now, ignore it as we * can't guarantee we won't explode. */ - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; return; #else frame.fp = thread_saved_fp(tsk); @@ -134,8 +132,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, } walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) @@ -153,8 +149,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) frame.pc = regs->ARM_pc; walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 9016f4081bb9..0393917eaa57 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -437,3 +437,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c62b9db2b5e8..d81adca1b04d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -149,7 +149,6 @@ config ARM64 select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RCU_TABLE_FREE - select HAVE_RCU_TABLE_INVALIDATE select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index cccb83ad7fa8..c7e1a7837706 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -30,8 +30,8 @@ do { \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ -"2: stlxr %w3, %w0, %2\n" \ -" cbnz %w3, 1b\n" \ +"2: stlxr %w0, %w3, %2\n" \ +" cbnz %w0, 1b\n" \ " dmb ish\n" \ "3:\n" \ " .pushsection .fixup,\"ax\"\n" \ @@ -57,23 +57,23 @@ arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("mov %w0, %w4", + __futex_atomic_op("mov %w3, %w4", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_ADD: - __futex_atomic_op("add %w0, %w1, %w4", + __futex_atomic_op("add %w3, %w1, %w4", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_OR: - __futex_atomic_op("orr %w0, %w1, %w4", + __futex_atomic_op("orr %w3, %w1, %w4", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_ANDN: - __futex_atomic_op("and %w0, %w1, %w4", + __futex_atomic_op("and %w3, %w1, %w4", ret, oldval, uaddr, tmp, ~oparg); break; case FUTEX_OP_XOR: - __futex_atomic_op("eor %w0, %w1, %w4", + __futex_atomic_op("eor %w3, %w1, %w4", ret, oldval, uaddr, tmp, oparg); break; default: diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 905e1bb0e7bd..cd9f4e9d04d3 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -73,4 +73,9 @@ static inline bool is_forbidden_offset_for_adrp(void *place) struct plt_entry get_plt_entry(u64 dst, void *pc); bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b); +static inline bool plt_entry_is_initialized(const struct plt_entry *e) +{ + return e->adrp || e->add || e->br; +} + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 106fdc951b6e..37603b5616a5 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -27,6 +27,7 @@ static inline void __tlb_remove_table(void *_table) free_page_and_swap_cache((struct page *)_table); } +#define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d1dd93436e1e..f2a83ff6b73c 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 424 +#define __NR_compat_syscalls 428 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 5590f2623690..23f1a44acada 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -866,6 +866,14 @@ __SYSCALL(__NR_rt_sigtimedwait_time64, compat_sys_rt_sigtimedwait_time64) __SYSCALL(__NR_futex_time64, sys_futex) #define __NR_sched_rr_get_interval_time64 423 __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8e4431a8821f..65a51331088e 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -103,12 +103,16 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * to be revisited if support for multiple ftrace entry points * is added in the future, but for now, the pr_err() below * deals with a theoretical issue only. + * + * Note that PLTs are place relative, and plt_entries_equal() + * checks whether they point to the same target. Here, we need + * to check if the actual opcodes are in fact identical, + * regardless of the offset in memory so use memcmp() instead. */ trampoline = get_plt_entry(addr, mod->arch.ftrace_trampoline); - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &trampoline)) { - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &(struct plt_entry){})) { + if (memcmp(mod->arch.ftrace_trampoline, &trampoline, + sizeof(trampoline))) { + if (plt_entry_is_initialized(mod->arch.ftrace_trampoline)) { pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); return -EINVAL; } diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d908b5e9e949..b00ec7d483d1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -140,8 +140,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) #endif walk_stackframe(current, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_regs); @@ -172,8 +170,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, #endif walk_stackframe(tsk, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; put_task_stack(tsk); } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8ad119c3f665..29755989f616 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -102,10 +102,16 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - int skip; + int skip = 0; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + if (regs) { + if (user_mode(regs)) + return; + skip = 1; + } + if (!tsk) tsk = current; @@ -126,7 +132,6 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.graph = 0; #endif - skip = !!regs; printk("Call trace:\n"); do { /* skip until specified stack frame */ @@ -176,15 +181,13 @@ static int __die(const char *str, int err, struct pt_regs *regs) return ret; print_modules(); - __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), end_of_stack(tsk)); + show_regs(regs); - if (!user_mode(regs)) { - dump_backtrace(regs, tsk); + if (!user_mode(regs)) dump_instr(KERN_EMERG, regs); - } return ret; } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6bc135042f5e..7cae155e81a5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -363,7 +363,7 @@ void __init arm64_memblock_init(void) * Otherwise, this is a no-op */ u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_size); + u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index ed92b5840c0a..eeb0471268a0 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -20,6 +20,7 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select ARCH_NO_COHERENT_DMA_MMAP + select MMU_GATHER_NO_RANGE if MMU config MMU def_bool n diff --git a/arch/c6x/include/asm/tlb.h b/arch/c6x/include/asm/tlb.h index 34525dea1356..240ba0febb57 100644 --- a/arch/c6x/include/asm/tlb.h +++ b/arch/c6x/include/asm/tlb.h @@ -2,8 +2,6 @@ #ifndef _ASM_C6X_TLB_H #define _ASM_C6X_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif /* _ASM_C6X_TLB_H */ diff --git a/arch/h8300/include/asm/tlb.h b/arch/h8300/include/asm/tlb.h index 98f344279904..d8201ca31206 100644 --- a/arch/h8300/include/asm/tlb.h +++ b/arch/h8300/include/asm/tlb.h @@ -2,8 +2,6 @@ #ifndef __H8300_TLB_H__ #define __H8300_TLB_H__ -#define tlb_flush(tlb) do { } while (0) - #include <asm-generic/tlb.h> #endif diff --git a/arch/hexagon/include/asm/tlb.h b/arch/hexagon/include/asm/tlb.h index 2f00772cc08a..f71c4ba83614 100644 --- a/arch/hexagon/include/asm/tlb.h +++ b/arch/hexagon/include/asm/tlb.h @@ -22,18 +22,6 @@ #include <linux/pagemap.h> #include <asm/tlbflush.h> -/* - * We don't need any special per-pte or per-vma handling... - */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it fills up - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h index 5133739966bc..beae261fbcb4 100644 --- a/arch/ia64/include/asm/machvec.h +++ b/arch/ia64/include/asm/machvec.h @@ -30,7 +30,6 @@ typedef void ia64_mv_irq_init_t (void); typedef void ia64_mv_send_ipi_t (int, int, int, int); typedef void ia64_mv_timer_interrupt_t (int, void *); typedef void ia64_mv_global_tlb_purge_t (struct mm_struct *, unsigned long, unsigned long, unsigned long); -typedef void ia64_mv_tlb_migrate_finish_t (struct mm_struct *); typedef u8 ia64_mv_irq_to_vector (int); typedef unsigned int ia64_mv_local_vector_to_irq (u8); typedef char *ia64_mv_pci_get_legacy_mem_t (struct pci_bus *); @@ -80,11 +79,6 @@ machvec_noop (void) } static inline void -machvec_noop_mm (struct mm_struct *mm) -{ -} - -static inline void machvec_noop_task (struct task_struct *task) { } @@ -96,7 +90,6 @@ machvec_noop_bus (struct pci_bus *bus) extern void machvec_setup (char **); extern void machvec_timer_interrupt (int, void *); -extern void machvec_tlb_migrate_finish (struct mm_struct *); # if defined (CONFIG_IA64_HP_SIM) # include <asm/machvec_hpsim.h> @@ -124,7 +117,6 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_send_ipi ia64_mv.send_ipi # define platform_timer_interrupt ia64_mv.timer_interrupt # define platform_global_tlb_purge ia64_mv.global_tlb_purge -# define platform_tlb_migrate_finish ia64_mv.tlb_migrate_finish # define platform_dma_init ia64_mv.dma_init # define platform_dma_get_ops ia64_mv.dma_get_ops # define platform_irq_to_vector ia64_mv.irq_to_vector @@ -167,7 +159,6 @@ struct ia64_machine_vector { ia64_mv_send_ipi_t *send_ipi; ia64_mv_timer_interrupt_t *timer_interrupt; ia64_mv_global_tlb_purge_t *global_tlb_purge; - ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish; ia64_mv_dma_init *dma_init; ia64_mv_dma_get_ops *dma_get_ops; ia64_mv_irq_to_vector *irq_to_vector; @@ -206,7 +197,6 @@ struct ia64_machine_vector { platform_send_ipi, \ platform_timer_interrupt, \ platform_global_tlb_purge, \ - platform_tlb_migrate_finish, \ platform_dma_init, \ platform_dma_get_ops, \ platform_irq_to_vector, \ @@ -270,9 +260,6 @@ extern const struct dma_map_ops *dma_get_ops(struct device *); #ifndef platform_global_tlb_purge # define platform_global_tlb_purge ia64_global_tlb_purge /* default to architected version */ #endif -#ifndef platform_tlb_migrate_finish -# define platform_tlb_migrate_finish machvec_noop_mm -#endif #ifndef platform_kernel_launch_event # define platform_kernel_launch_event machvec_noop #endif diff --git a/arch/ia64/include/asm/machvec_sn2.h b/arch/ia64/include/asm/machvec_sn2.h index b5153d300289..a243e4fb4877 100644 --- a/arch/ia64/include/asm/machvec_sn2.h +++ b/arch/ia64/include/asm/machvec_sn2.h @@ -34,7 +34,6 @@ extern ia64_mv_irq_init_t sn_irq_init; extern ia64_mv_send_ipi_t sn2_send_IPI; extern ia64_mv_timer_interrupt_t sn_timer_interrupt; extern ia64_mv_global_tlb_purge_t sn2_global_tlb_purge; -extern ia64_mv_tlb_migrate_finish_t sn_tlb_migrate_finish; extern ia64_mv_irq_to_vector sn_irq_to_vector; extern ia64_mv_local_vector_to_irq sn_local_vector_to_irq; extern ia64_mv_pci_get_legacy_mem_t sn_pci_get_legacy_mem; @@ -77,7 +76,6 @@ extern ia64_mv_pci_fixup_bus_t sn_pci_fixup_bus; #define platform_send_ipi sn2_send_IPI #define platform_timer_interrupt sn_timer_interrupt #define platform_global_tlb_purge sn2_global_tlb_purge -#define platform_tlb_migrate_finish sn_tlb_migrate_finish #define platform_pci_fixup sn_pci_fixup #define platform_inb __sn_inb #define platform_inw __sn_inw diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 516355a774bf..86ec034ba499 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -47,263 +47,6 @@ #include <asm/tlbflush.h> #include <asm/machvec.h> -/* - * If we can't allocate a page to make a big batch of page pointers - * to work on, then just handle a few from the on-stack structure. - */ -#define IA64_GATHER_BUNDLE 8 - -struct mmu_gather { - struct mm_struct *mm; - unsigned int nr; - unsigned int max; - unsigned char fullmm; /* non-zero means full mm flush */ - unsigned char need_flush; /* really unmapped some PTEs? */ - unsigned long start, end; - unsigned long start_addr; - unsigned long end_addr; - struct page **pages; - struct page *local[IA64_GATHER_BUNDLE]; -}; - -struct ia64_tr_entry { - u64 ifa; - u64 itir; - u64 pte; - u64 rr; -}; /*Record for tr entry!*/ - -extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); -extern void ia64_ptr_entry(u64 target_mask, int slot); - -extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; - -/* - region register macros -*/ -#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) -#define RR_VE(val) (((val) & 0x0000000000000001) << 0) -#define RR_VE_MASK 0x0000000000000001L -#define RR_VE_SHIFT 0 -#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) -#define RR_PS(val) (((val) & 0x000000000000003f) << 2) -#define RR_PS_MASK 0x00000000000000fcL -#define RR_PS_SHIFT 2 -#define RR_RID_MASK 0x00000000ffffff00L -#define RR_TO_RID(val) ((val >> 8) & 0xffffff) - -static inline void -ia64_tlb_flush_mmu_tlbonly(struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - tlb->need_flush = 0; - - if (tlb->fullmm) { - /* - * Tearing down the entire address space. This happens both as a result - * of exit() and execve(). The latter case necessitates the call to - * flush_tlb_mm() here. - */ - flush_tlb_mm(tlb->mm); - } else if (unlikely (end - start >= 1024*1024*1024*1024UL - || REGION_NUMBER(start) != REGION_NUMBER(end - 1))) - { - /* - * If we flush more than a tera-byte or across regions, we're probably - * better off just flushing the entire TLB(s). This should be very rare - * and is not worth optimizing for. - */ - flush_tlb_all(); - } else { - /* - * flush_tlb_range() takes a vma instead of a mm pointer because - * some architectures want the vm_flags for ITLB/DTLB flush. - */ - struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); - - /* flush the address range from the tlb: */ - flush_tlb_range(&vma, start, end); - /* now flush the virt. page-table area mapping the address range: */ - flush_tlb_range(&vma, ia64_thash(start), ia64_thash(end)); - } - -} - -static inline void -ia64_tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - unsigned long i; - unsigned int nr; - - /* lastly, release the freed pages */ - nr = tlb->nr; - - tlb->nr = 0; - tlb->start_addr = ~0UL; - for (i = 0; i < nr; ++i) - free_page_and_swap_cache(tlb->pages[i]); -} - -/* - * Flush the TLB for address range START to END and, if not in fast mode, release the - * freed pages that where gathered up to this point. - */ -static inline void -ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - if (!tlb->need_flush) - return; - ia64_tlb_flush_mmu_tlbonly(tlb, start, end); - ia64_tlb_flush_mmu_free(tlb); -} - -static inline void __tlb_alloc_page(struct mmu_gather *tlb) -{ - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - - if (addr) { - tlb->pages = (void *)addr; - tlb->max = PAGE_SIZE / sizeof(void *); - } -} - - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->max = ARRAY_SIZE(tlb->local); - tlb->pages = tlb->local; - tlb->nr = 0; - tlb->fullmm = !(start | (end+1)); - tlb->start = start; - tlb->end = end; - tlb->start_addr = ~0UL; -} - -/* - * Called at the end of the shootdown operation to free up any resources that were - * collected. - */ -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) - tlb->need_flush = 1; - /* - * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and - * tlb->end_addr. - */ - ia64_tlb_flush_mmu(tlb, start, end); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - if (tlb->pages != tlb->local) - free_pages((unsigned long)tlb->pages, 0); -} - -/* - * Logically, this routine frees PAGE. On MP machines, the actual freeing of the page - * must be delayed until after the TLB has been flushed (see comments at the beginning of - * this file). - */ -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - - if (!tlb->nr && tlb->pages == tlb->local) - __tlb_alloc_page(tlb); - - tlb->pages[tlb->nr++] = page; - VM_WARN_ON(tlb->nr > tlb->max); - if (tlb->nr == tlb->max) - return true; - return false; -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu_tlbonly(tlb, tlb->start_addr, tlb->end_addr); -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu_free(tlb); -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr); -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -/* - * Remove TLB entry for PTE mapped at virtual address ADDRESS. This is called for any - * PTE, not just those pointing to (normal) physical memory. - */ -static inline void -__tlb_remove_tlb_entry (struct mmu_gather *tlb, pte_t *ptep, unsigned long address) -{ - if (tlb->start_addr == ~0UL) - tlb->start_addr = address; - tlb->end_addr = address + PAGE_SIZE; -} - -#define tlb_migrate_finish(mm) platform_tlb_migrate_finish(mm) - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - -#define tlb_remove_tlb_entry(tlb, ptep, addr) \ -do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, addr); \ -} while (0) - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, address) \ -do { \ - tlb->need_flush = 1; \ - __pte_free_tlb(tlb, ptep, address); \ -} while (0) - -#define pmd_free_tlb(tlb, ptep, address) \ -do { \ - tlb->need_flush = 1; \ - __pmd_free_tlb(tlb, ptep, address); \ -} while (0) - -#define pud_free_tlb(tlb, pudp, address) \ -do { \ - tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp, address); \ -} while (0) +#include <asm-generic/tlb.h> #endif /* _ASM_IA64_TLB_H */ diff --git a/arch/ia64/include/asm/tlbflush.h b/arch/ia64/include/asm/tlbflush.h index 25e280810f6c..ceac10c4d6e2 100644 --- a/arch/ia64/include/asm/tlbflush.h +++ b/arch/ia64/include/asm/tlbflush.h @@ -14,6 +14,31 @@ #include <asm/mmu_context.h> #include <asm/page.h> +struct ia64_tr_entry { + u64 ifa; + u64 itir; + u64 pte; + u64 rr; +}; /*Record for tr entry!*/ + +extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); +extern void ia64_ptr_entry(u64 target_mask, int slot); +extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; + +/* + region register macros +*/ +#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) +#define RR_VE(val) (((val) & 0x0000000000000001) << 0) +#define RR_VE_MASK 0x0000000000000001L +#define RR_VE_SHIFT 0 +#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) +#define RR_PS(val) (((val) & 0x000000000000003f) << 2) +#define RR_PS_MASK 0x00000000000000fcL +#define RR_PS_SHIFT 2 +#define RR_RID_MASK 0x00000000ffffff00L +#define RR_TO_RID(val) ((val >> 8) & 0xffffff) + /* * Now for some TLB flushing routines. This is the kind of stuff that * can be very expensive, so try to avoid them whenever possible. diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 583a3746d70b..c9cfa760cd57 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -1058,9 +1058,7 @@ check_bugs (void) static int __init run_dmi_scan(void) { - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); return 0; } core_initcall(run_dmi_scan); diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index ab9cda5f6136..56e3d0b685e1 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -344,3 +344,7 @@ 332 common pkey_free sys_pkey_free 333 common rseq sys_rseq # 334 through 423 are reserved to sync up with other architectures +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 5fc89aabdce1..5158bd28de05 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -305,8 +305,8 @@ local_flush_tlb_all (void) ia64_srlz_i(); /* srlz.i implies srlz.d */ } -void -flush_tlb_range (struct vm_area_struct *vma, unsigned long start, +static void +__flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; @@ -343,6 +343,25 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, preempt_enable(); ia64_srlz_i(); /* srlz.i implies srlz.d */ } + +void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (unlikely(end - start >= 1024*1024*1024*1024UL + || REGION_NUMBER(start) != REGION_NUMBER(end - 1))) { + /* + * If we flush more than a tera-byte or across regions, we're + * probably better off just flushing the entire TLB(s). This + * should be very rare and is not worth optimizing for. + */ + flush_tlb_all(); + } else { + /* flush the address range from the tlb */ + __flush_tlb_range(vma, start, end); + /* flush the virt. page-table area mapping the addr range */ + __flush_tlb_range(vma, ia64_thash(start), ia64_thash(end)); + } +} EXPORT_SYMBOL(flush_tlb_range); void ia64_tlb_init(void) diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index b73b0ebf8214..b510f4f17fd4 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -120,13 +120,6 @@ void sn_migrate(struct task_struct *task) cpu_relax(); } -void sn_tlb_migrate_finish(struct mm_struct *mm) -{ - /* flush_tlb_mm is inefficient if more than 1 users of mm */ - if (mm == current->mm && mm && atomic_read(&mm->mm_users) == 1) - flush_tlb_mm(mm); -} - static void sn2_ipi_flush_all_tlb(struct mm_struct *mm) { diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index f5661f48019c..735b9679fe6f 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -28,6 +28,7 @@ config M68K select OLD_SIGSUSPEND3 select OLD_SIGACTION select ARCH_DISCARD_MEMBLOCK + select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y diff --git a/arch/m68k/include/asm/tlb.h b/arch/m68k/include/asm/tlb.h index b4b9efb6f963..3c81f6adfc8b 100644 --- a/arch/m68k/include/asm/tlb.h +++ b/arch/m68k/include/asm/tlb.h @@ -2,20 +2,6 @@ #ifndef _M68K_TLB_H #define _M68K_TLB_H -/* - * m68k doesn't need any special per-pte or - * per-vma handling.. - */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it - * fills up. - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif /* _M68K_TLB_H */ diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 125c14178979..df4ec3ec71d1 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -423,3 +423,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a7a9a9d59f65..adb179f519f9 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -41,6 +41,7 @@ config MICROBLAZE select TRACING_SUPPORT select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS + select MMU_GATHER_NO_RANGE if MMU # Endianness selection choice diff --git a/arch/microblaze/include/asm/tlb.h b/arch/microblaze/include/asm/tlb.h index 99b6ded54849..628a78ee0a72 100644 --- a/arch/microblaze/include/asm/tlb.h +++ b/arch/microblaze/include/asm/tlb.h @@ -11,16 +11,7 @@ #ifndef _ASM_MICROBLAZE_TLB_H #define _ASM_MICROBLAZE_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <linux/pagemap.h> - -#ifdef CONFIG_MMU -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) -#endif - #include <asm-generic/tlb.h> #endif /* _ASM_MICROBLAZE_TLB_H */ diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 8ee3a8c18498..4964947732af 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -429,3 +429,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 4a70c5de8c92..25a57895a3a3 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -210,12 +210,6 @@ const char *get_system_type(void) return ath79_sys_type; } -int get_c0_perfcount_int(void) -{ - return ATH79_MISC_IRQ(5); -} -EXPORT_SYMBOL_GPL(get_c0_perfcount_int); - unsigned int get_c0_compare_int(void) { return CP0_LEGACY_COMPARE_IRQ; diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h index b6823b9e94da..90f3ad76d9e0 100644 --- a/arch/mips/include/asm/tlb.h +++ b/arch/mips/include/asm/tlb.h @@ -5,23 +5,6 @@ #include <asm/cpu-features.h> #include <asm/mipsregs.h> -/* - * MIPS doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. - */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it fills up. - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #define _UNIQUE_ENTRYHI(base, idx) \ (((base) + ((idx) << (PAGE_SHIFT + 1))) | \ (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0)) diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index f158c5894a9a..feb2653490df 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -125,7 +125,7 @@ trace_a_syscall: subu t1, v0, __NR_O32_Linux move a1, v0 bnez t1, 1f /* __NR_syscall at offset 0 */ - lw a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ + ld a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ .set pop 1: jal syscall_trace_enter diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 15f4117900ee..9392dfe33f97 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -362,3 +362,7 @@ 421 n32 rt_sigtimedwait_time64 compat_sys_rt_sigtimedwait_time64 422 n32 futex_time64 sys_futex 423 n32 sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 n32 pidfd_send_signal sys_pidfd_send_signal +425 n32 io_uring_setup sys_io_uring_setup +426 n32 io_uring_enter sys_io_uring_enter +427 n32 io_uring_register sys_io_uring_register diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c85502e67b44..cd0c8aa21fba 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -338,3 +338,7 @@ 327 n64 rseq sys_rseq 328 n64 io_pgetevents sys_io_pgetevents # 329 through 423 are reserved to sync up with other architectures +424 n64 pidfd_send_signal sys_pidfd_send_signal +425 n64 io_uring_setup sys_io_uring_setup +426 n64 io_uring_enter sys_io_uring_enter +427 n64 io_uring_register sys_io_uring_register diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 2e063d0f837e..e849e8ffe4a2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -411,3 +411,7 @@ 421 o32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 o32 futex_time64 sys_futex sys_futex 423 o32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 o32 pidfd_send_signal sys_pidfd_send_signal +425 o32 io_uring_setup sys_io_uring_setup +426 o32 io_uring_enter sys_io_uring_enter +427 o32 io_uring_register sys_io_uring_register diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 0effd3cba9a7..98bf0c222b5f 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -186,8 +186,9 @@ enum which_ebpf_reg { * separate frame pointer, so BPF_REG_10 relative accesses are * adjusted to be $sp relative. */ -int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn, - enum which_ebpf_reg w) +static int ebpf_to_mips_reg(struct jit_ctx *ctx, + const struct bpf_insn *insn, + enum which_ebpf_reg w) { int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ? insn->src_reg : insn->dst_reg; diff --git a/arch/nds32/include/asm/tlb.h b/arch/nds32/include/asm/tlb.h index b35ae5eae3ab..d5ae571c8d30 100644 --- a/arch/nds32/include/asm/tlb.h +++ b/arch/nds32/include/asm/tlb.h @@ -4,22 +4,6 @@ #ifndef __ASMNDS32_TLB_H #define __ASMNDS32_TLB_H -#define tlb_start_vma(tlb,vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb,vma) \ - do { \ - if(!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) diff --git a/arch/nds32/include/asm/tlbflush.h b/arch/nds32/include/asm/tlbflush.h index 9b411f401903..38ee769b18d8 100644 --- a/arch/nds32/include/asm/tlbflush.h +++ b/arch/nds32/include/asm/tlbflush.h @@ -42,6 +42,5 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * pte); -void tlb_migrate_finish(struct mm_struct *mm); #endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 56685fd45ed0..ea37394ff3ea 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -24,6 +24,7 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select ARCH_DISCARD_MEMBLOCK + select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM def_bool y diff --git a/arch/nios2/include/asm/tlb.h b/arch/nios2/include/asm/tlb.h index d3bc648e08b5..f9f2e27e32dd 100644 --- a/arch/nios2/include/asm/tlb.h +++ b/arch/nios2/include/asm/tlb.h @@ -11,22 +11,12 @@ #ifndef _ASM_NIOS2_TLB_H #define _ASM_NIOS2_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - extern void set_mmu_pid(unsigned long pid); /* - * NiosII doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for the area to be unmapped. + * NIOS32 does have flush_tlb_range(), but it lacks a limit and fallback to + * full mm invalidation. So use flush_tlb_mm() for everything. */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #include <linux/pagemap.h> #include <asm-generic/tlb.h> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 683511b8c9df..7cfb20555b10 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -36,6 +36,7 @@ config OPENRISC select OMPIC if SMP select ARCH_WANT_FRAME_POINTERS select GENERIC_IRQ_MULTI_HANDLER + select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y diff --git a/arch/openrisc/include/asm/tlb.h b/arch/openrisc/include/asm/tlb.h index fa4376a4515d..92d8a4209884 100644 --- a/arch/openrisc/include/asm/tlb.h +++ b/arch/openrisc/include/asm/tlb.h @@ -20,14 +20,10 @@ #define __ASM_OPENRISC_TLB_H__ /* - * or32 doesn't need any special per-pte or - * per-vma handling.. + * OpenRISC doesn't have an efficient flush_tlb_range() so use flush_tlb_mm() + * for everything. */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #include <linux/pagemap.h> #include <asm-generic/tlb.h> diff --git a/arch/parisc/include/asm/tlb.h b/arch/parisc/include/asm/tlb.h index 0c881e74d8a6..8c0446b04c9e 100644 --- a/arch/parisc/include/asm/tlb.h +++ b/arch/parisc/include/asm/tlb.h @@ -2,24 +2,6 @@ #ifndef _PARISC_TLB_H #define _PARISC_TLB_H -#define tlb_flush(tlb) \ -do { if ((tlb)->fullmm) \ - flush_tlb_mm((tlb)->mm);\ -} while (0) - -#define tlb_start_vma(tlb, vma) \ -do { if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define tlb_end_vma(tlb, vma) \ -do { if (!(tlb)->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, address) \ - do { } while (0) - #include <asm-generic/tlb.h> #define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) diff --git a/arch/parisc/kernel/stacktrace.c b/arch/parisc/kernel/stacktrace.c index ec5835e83a7a..6f0b9c8d8052 100644 --- a/arch/parisc/kernel/stacktrace.c +++ b/arch/parisc/kernel/stacktrace.c @@ -29,22 +29,17 @@ static void dump_trace(struct task_struct *task, struct stack_trace *trace) } } - /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace) { dump_trace(current, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b26766c6647d..fe8ca623add8 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -420,3 +420,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e5dd6aafaf68..fab0bf4259c7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -211,6 +211,8 @@ config PPC select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE + select HAVE_MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 5ba131c30f6b..1bcd468ab422 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -266,6 +266,7 @@ CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_PROC_KCORE=y +CONFIG_HUGETLBFS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS=y diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 598cdcdd1355..8ddd4a91bdc1 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -352,7 +352,7 @@ static inline bool strict_kernel_rwx_enabled(void) #if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ defined (CONFIG_PPC_64K_PAGES) #define MAX_PHYSMEM_BITS 51 -#elif defined(CONFIG_SPARSEMEM) +#elif defined(CONFIG_PPC64) #define MAX_PHYSMEM_BITS 46 #endif diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e24c67d5ba75..34fba1ce27f7 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -27,8 +27,8 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +#define tlb_flush tlb_flush extern void tlb_flush(struct mmu_gather *tlb); /* Get the generic bits... */ @@ -46,22 +46,6 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ - if (!tlb->page_size) - tlb->page_size = page_size; - else if (tlb->page_size != page_size) { - if (!tlb->fullmm) - tlb_flush_mmu(tlb); - /* - * update the page size after flush for the new - * mmu_gather. - */ - tlb->page_size = page_size; - } -} - #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a5b8fbae56a0..9481a117e242 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -656,11 +656,17 @@ EXC_COMMON_BEGIN(data_access_slb_common) ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + /* HPT case, do SLB fault */ bl do_slb_fault cmpdi r3,0 bne- 1f b fast_exception_return 1: /* Error case */ +MMU_FTR_SECTION_ELSE + /* Radix case, access is outside page table range */ + li r3,-EFAULT +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) @@ -705,11 +711,17 @@ EXC_COMMON_BEGIN(instruction_access_slb_common) EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) ld r4,_NIP(r1) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + /* HPT case, do SLB fault */ bl do_slb_fault cmpdi r3,0 bne- 1f b fast_exception_return 1: /* Error case */ +MMU_FTR_SECTION_ELSE + /* Radix case, access is outside page table range */ + li r3,-EFAULT +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 48051c8977c5..e25b615e9f9e 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -851,10 +851,6 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 -#ifdef CONFIG_PPC_RTAS - li r3,0 - stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ -#endif lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l mtspr SPRN_SPRG_PGDIR, r4 @@ -941,10 +937,6 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 -#ifdef CONFIG_PPC_RTAS - li r3,0 - stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ -#endif lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l mtspr SPRN_SPRG_PGDIR, r4 diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index b33bafb8fcea..70568ccbd9fd 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -57,7 +57,7 @@ void setup_barrier_nospec(void) enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR); - if (!no_nospec) + if (!no_nospec && !cpu_mitigations_off()) enable_barrier_nospec(enable); } @@ -116,7 +116,7 @@ static int __init handle_nospectre_v2(char *p) early_param("nospectre_v2", handle_nospectre_v2); void setup_spectre_v2(void) { - if (no_spectrev2) + if (no_spectrev2 || cpu_mitigations_off()) do_btb_flush_fixups(); else btb_flush_enabled = true; @@ -300,7 +300,7 @@ void setup_stf_barrier(void) stf_enabled_flush_types = type; - if (!no_stf_barrier) + if (!no_stf_barrier && !cpu_mitigations_off()) stf_barrier_enable(enable); } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ba404dd9ce1d..4f49e1a3594c 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -932,7 +932,7 @@ void setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - if (!no_rfi_flush) + if (!no_rfi_flush && !cpu_mitigations_off()) rfi_flush_enable(enable); } diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index b18abb0c3dae..00f5a63c8d9a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -505,3 +505,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 1e0bc5955a40..afd516b572f8 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -98,7 +98,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * can be used, r7 contains NSEC_PER_SEC. */ - lwz r5,WTOM_CLOCK_SEC(r9) + lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9) lwz r6,WTOM_CLOCK_NSEC(r9) /* We now have our offset in r5,r6. We create a fake dependency diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index f02b04973710..f100e331e69b 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -543,14 +543,14 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_validate(stt, tce); if (ret != H_SUCCESS) - return ret; + goto unlock_exit; dir = iommu_tce_direction(tce); - idx = srcu_read_lock(&vcpu->kvm->srcu); - if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; goto unlock_exit; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 06964350b97a..b2b29d4f9842 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3423,7 +3423,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); - mtspr(SPRN_PSSCR, host_psscr); + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ + mtspr(SPRN_PSSCR, host_psscr | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); mtspr(SPRN_HFSCR, host_hfscr); mtspr(SPRN_CIABR, host_ciabr); mtspr(SPRN_DAWR, host_dawr); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e7a9c4f6bfca..8330f135294f 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -95,28 +95,15 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entries, unsigned long dev_hpa, struct mm_iommu_table_group_mem_t **pmem) { - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; + struct mm_iommu_table_group_mem_t *mem, *mem2; + long i, ret, locked_entries = 0, pinned = 0; unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? */ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } + unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { ret = mm_iommu_adjust_locked_vm(mm, entries, true); if (ret) - goto unlock_exit; + return ret; locked_entries = entries; } @@ -148,17 +135,27 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + sizeof(struct vm_area_struct *); + chunk = min(chunk, entries); + for (entry = 0; entry < entries; entry += chunk) { + unsigned long n = min(entries - entry, chunk); + + ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE, mem->hpages + entry, NULL); + if (ret == n) { + pinned += n; + continue; + } + if (ret > 0) + pinned += ret; + break; + } up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; + if (pinned != entries) { + if (!ret) + ret = -EFAULT; + goto free_exit; } pageshift = PAGE_SHIFT; @@ -183,21 +180,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } good_exit: - ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; mem->entries = entries; - *pmem = mem; - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + mutex_lock(&mem_list_mutex); -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) { + /* Overlap? */ + if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem2->ua + + (mem2->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + mutex_unlock(&mem_list_mutex); + goto free_exit; + } + } + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); mutex_unlock(&mem_list_mutex); + *pmem = mem; + + return 0; + +free_exit: + /* free the reference taken */ + for (i = 0; i < pinned; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + +unlock_exit: + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + return ret; } @@ -266,7 +285,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) { long ret = 0; - unsigned long entries, dev_hpa; + unsigned long unlock_entries = 0; mutex_lock(&mem_list_mutex); @@ -287,17 +306,17 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) goto unlock_exit; } + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + unlock_entries = mem->entries; + /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; mm_iommu_release(mem); - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - unlock_exit: mutex_unlock(&mem_list_mutex); + mm_iommu_adjust_locked_vm(mm, unlock_entries, false); + return ret; } EXPORT_SYMBOL_GPL(mm_iommu_put); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index f29d2f118b44..5d9c3ff728c9 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -98,10 +98,20 @@ static int find_free_bat(void) return -1; } +/* + * This function calculates the size of the larger block usable to map the + * beginning of an area based on the start address and size of that area: + * - max block size is 8M on 601 and 256 on other 6xx. + * - base address must be aligned to the block size. So the maximum block size + * is identified by the lowest bit set to 1 in the base address (for instance + * if base is 0x16000000, max size is 0x02000000). + * - block size has to be a power of two. This is calculated by finding the + * highest bit set to 1. + */ static unsigned int block_size(unsigned long base, unsigned long top) { unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; - unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; return min3(max_size, 1U << base_shift, 1U << block_shift); @@ -157,7 +167,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { - int done; + unsigned long done; unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; if (__map_without_bats) { @@ -169,10 +179,10 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return __mmu_mapin_ram(base, top); done = __mmu_mapin_ram(base, border); - if (done != border - base) + if (done != border) return done; - return done + __mmu_mapin_ram(border, top); + return __mmu_mapin_ram(border, top); } void mmu_mark_initmem_nx(void) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 842b2c7e156a..50cd09b4e05d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -324,7 +324,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" - depends on PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 && HUGETLB_PAGE select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig new file mode 100644 index 000000000000..1a911ed8e772 --- /dev/null +++ b/arch/riscv/configs/rv32_defconfig @@ -0,0 +1,84 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_RV32I=y +CONFIG_SMP=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_EARLYCON_RISCV_SBI=y +CONFIG_HVC_RISCV_SBI=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_SIFIVE_PLIC=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_DEV_VIRTIO=y +CONFIG_PRINTK_TIME=y +# CONFIG_RCU_TRACE is not set diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 439dc7072e05..1ad8d093c58b 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -18,6 +18,7 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); +#define tlb_flush tlb_flush #include <asm-generic/tlb.h> static inline void tlb_flush(struct mmu_gather *tlb) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index a4b1d94371a0..4d403274c2e8 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -169,8 +169,6 @@ static bool save_trace(unsigned long pc, void *arg) void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { walk_stackframe(tsk, NULL, save_trace, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 5fd8c922e1c2..bc7b77e34d09 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -121,6 +121,14 @@ void __init setup_bootmem(void) */ memblock_reserve(reg->base, vmlinux_end - reg->base); mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET); + + /* + * Remove memblock from the end of usable area to the + * end of region + */ + if (reg->base + mem_size < end) + memblock_remove(reg->base + mem_size, + end - reg->base - mem_size); } } BUG_ON(mem_size == 0); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c9300b437195..97b555e772d7 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -158,11 +158,13 @@ config S390 select HAVE_PERF_USER_STACK_DUMP select HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP + select HAVE_MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC select HAVE_NOP_MCOUNT select HAVE_OPROFILE select HAVE_PCI select HAVE_PERF_EVENTS + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c index 4cb771ba13fa..5d316fe40480 100644 --- a/arch/s390/boot/mem_detect.c +++ b/arch/s390/boot/mem_detect.c @@ -25,7 +25,7 @@ static void *mem_detect_alloc_extended(void) { unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - if (IS_ENABLED(BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && INITRD_START < offset + ENTRIES_EXTENDED_MAX) offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64)); diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b31c779cf581..aa406c05a350 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -22,98 +22,39 @@ * Pages used for the page tables is a different story. FIXME: more */ -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <asm/processor.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> - -struct mmu_gather { - struct mm_struct *mm; - struct mmu_table_batch *batch; - unsigned int fullmm; - unsigned long start, end; -}; - -struct mmu_table_batch { - struct rcu_head rcu; - unsigned int nr; - void *tables[0]; -}; - -#define MAX_TABLE_BATCH \ - ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) - -extern void tlb_table_flush(struct mmu_gather *tlb); -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - tlb->batch = NULL; -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - __tlb_flush_mm_lazy(tlb->mm); -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - tlb_table_flush(tlb); -} - +void __tlb_remove_table(void *_table); +static inline void tlb_flush(struct mmu_gather *tlb); +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size); -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} +#define tlb_start_vma(tlb, vma) do { } while (0) +#define tlb_end_vma(tlb, vma) do { } while (0) -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->start = start; - tlb->end = end; - } +#define tlb_flush tlb_flush +#define pte_free_tlb pte_free_tlb +#define pmd_free_tlb pmd_free_tlb +#define p4d_free_tlb p4d_free_tlb +#define pud_free_tlb pud_free_tlb - tlb_flush_mmu(tlb); -} +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm-generic/tlb.h> /* * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. */ -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); -} - static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - return __tlb_remove_page(tlb, page); + free_page_and_swap_cache(page); + return false; } -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) +static inline void tlb_flush(struct mmu_gather *tlb) { - return tlb_remove_page(tlb, page); + __tlb_flush_mm_lazy(tlb->mm); } /* @@ -121,8 +62,17 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, * page table from the tlb. */ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, - unsigned long address) + unsigned long address) { + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_ptes = 1; + /* + * page_table_free_rcu takes care of the allocation bit masks + * of the 2K table fragments in the 4K page table page, + * then calls tlb_remove_table. + */ page_table_free_rcu(tlb, (unsigned long *) pte, address); } @@ -139,6 +89,10 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, if (mm_pmd_folded(tlb->mm)) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; tlb_remove_table(tlb, pmd); } @@ -154,6 +108,10 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, { if (mm_p4d_folded(tlb->mm)) return; + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_p4ds = 1; tlb_remove_table(tlb, p4d); } @@ -169,21 +127,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, { if (mm_pud_folded(tlb->mm)) return; + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; tlb_remove_table(tlb, pud); } -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) -#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) -#define tlb_migrate_finish(mm) do { } while (0) -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} #endif /* _S390_TLB_H */ diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 594464f2129d..0da378e2eb25 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -23,7 +23,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) if (flags & KERNEL_FPC) /* Save floating point control */ - asm volatile("stfpc %0" : "=m" (state->fpc)); + asm volatile("stfpc %0" : "=Q" (state->fpc)); if (!MACHINE_HAS_VX) { if (flags & KERNEL_VXR_V0V7) { diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index bdddaae96559..649135cbedd5 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> #include <linux/device.h> +#include <linux/cpu.h> #include <asm/nospec-branch.h> static int __init nobp_setup_early(char *str) @@ -58,7 +59,7 @@ early_param("nospectre_v2", nospectre_v2_setup_early); void __init nospec_auto_detect(void) { - if (test_facility(156)) { + if (test_facility(156) || cpu_mitigations_off()) { /* * The machine supports etokens. * Disable expolines and disable nobp. diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 460dcfba7d4e..cc9ed9787068 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -45,8 +45,6 @@ void save_stack_trace(struct stack_trace *trace) sp = current_stack_pointer(); dump_trace(save_address, trace, NULL, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -58,8 +56,6 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (tsk == current) sp = current_stack_pointer(); dump_trace(save_address_nosched, trace, tsk, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); @@ -69,7 +65,5 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) sp = kernel_stack_pointer(regs); dump_trace(save_address, trace, NULL, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_regs); diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 02579f95f391..061418f787c3 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -426,3 +426,7 @@ 421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 - sys_futex 423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register sys_io_uring_register diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index a69a0911ed0e..c475ca49cfc6 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -37,7 +37,7 @@ static inline u64 get_vtimer(void) { u64 timer; - asm volatile("stpt %0" : "=m" (timer)); + asm volatile("stpt %0" : "=Q" (timer)); return timer; } @@ -48,7 +48,7 @@ static inline void set_vtimer(u64 expires) asm volatile( " stpt %0\n" /* Store current cpu timer value */ " spt %1" /* Set new value imm. afterwards */ - : "=m" (timer) : "m" (expires)); + : "=Q" (timer) : "Q" (expires)); S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; S390_lowcore.last_update_timer = expires; } @@ -135,8 +135,8 @@ static int do_account_vtime(struct task_struct *tsk) #else " stck %1" /* Store current tod clock value */ #endif - : "=m" (S390_lowcore.last_update_timer), - "=m" (S390_lowcore.last_update_clock)); + : "=Q" (S390_lowcore.last_update_timer), + "=Q" (S390_lowcore.last_update_clock)); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index db6bb2f97a2c..99e06213a22b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -290,7 +290,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, tlb_remove_table(tlb, table); } -static void __tlb_remove_table(void *_table) +void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 3; void *table = (void *)((unsigned long) _table ^ mask); @@ -316,67 +316,6 @@ static void __tlb_remove_table(void *_table) } } -static void tlb_remove_table_smp_sync(void *arg) -{ - /* Simply deliver the interrupt */ -} - -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. - */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); -} - -static void tlb_remove_table_rcu(struct rcu_head *head) -{ - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); -} - -void tlb_table_flush(struct mmu_gather *tlb) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch) { - call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; - } -} - -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - tlb->mm->context.flush_mm = 1; - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm_lazy(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; - } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_flush_mmu(tlb); -} - /* * Base infrastructure required to generate basic asces, region, segment, * and page tables that do not make use of enhanced features like EDAT1. diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8ad73cb31121..b56f908b1395 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -70,6 +70,15 @@ do { \ tlb_remove_page((tlb), (pte)); \ } while (0) +#if CONFIG_PGTABLE_LEVELS > 2 +#define __pmd_free_tlb(tlb, pmdp, addr) \ +do { \ + struct page *page = virt_to_page(pmdp); \ + pgtable_pmd_page_dtor(page); \ + tlb_remove_page((tlb), page); \ +} while (0); +#endif + static inline void check_pgt_cache(void) { quicklist_trim(QUICK_PT, NULL, 25, 16); diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 77abe192fb43..bc77f3dd4261 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -11,133 +11,8 @@ #ifdef CONFIG_MMU #include <linux/swap.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -/* - * TLB handling. This allows us to remove pages from the page - * tables, and efficiently handle the TLB issues. - */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int fullmm; - unsigned long start, end; -}; - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - - init_tlb_gather(tlb); -} - -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (tlb->fullmm || force) - flush_tlb_mm(tlb->mm); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -static inline void -tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -/* - * In the case of tlb vma handling, we can optimise these away in the - * case where we're doing a full MM flush. When we're doing a munmap, - * the vmas are adjusted to only cover the region to be torn down. - */ -static inline void -tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) - flush_cache_range(vma, vma->vm_start, vma->vm_end); -} - -static inline void -tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm && tlb->end) { - flush_tlb_range(vma, tlb->start, tlb->end); - init_tlb_gather(tlb); - } -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ -} - -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) -#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) - -#define tlb_migrate_finish(mm) do { } while (0) +#include <asm-generic/tlb.h> #if defined(CONFIG_CPU_SH4) || defined(CONFIG_SUPERH64) extern void tlb_wire_entry(struct vm_area_struct *, unsigned long, pte_t); @@ -157,11 +32,6 @@ static inline void tlb_unwire_entry(void) #else /* CONFIG_MMU */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) -#define tlb_flush(tlb) do { } while (0) - #include <asm-generic/tlb.h> #endif /* CONFIG_MMU */ diff --git a/arch/sh/kernel/stacktrace.c b/arch/sh/kernel/stacktrace.c index f3cb2cccb262..2950b19ad077 100644 --- a/arch/sh/kernel/stacktrace.c +++ b/arch/sh/kernel/stacktrace.c @@ -49,8 +49,6 @@ void save_stack_trace(struct stack_trace *trace) unsigned long *sp = (unsigned long *)current_stack_pointer; unwind_stack(current, NULL, sp, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -84,7 +82,5 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) unsigned long *sp = (unsigned long *)tsk->thread.sp; unwind_stack(current, NULL, sp, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index bfda678576e4..480b057556ee 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -426,3 +426,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 16b620237816..f6421c9ce5d3 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -63,6 +63,7 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE diff --git a/arch/sparc/include/asm/tlb_32.h b/arch/sparc/include/asm/tlb_32.h index 343cea19e573..5cd28a8793e3 100644 --- a/arch/sparc/include/asm/tlb_32.h +++ b/arch/sparc/include/asm/tlb_32.h @@ -2,24 +2,6 @@ #ifndef _SPARC_TLB_H #define _SPARC_TLB_H -#define tlb_start_vma(tlb, vma) \ -do { \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define tlb_end_vma(tlb, vma) \ -do { \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, address) \ - do { } while (0) - -#define tlb_flush(tlb) \ -do { \ - flush_tlb_mm((tlb)->mm); \ -} while (0) - #include <asm-generic/tlb.h> #endif /* _SPARC_TLB_H */ diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index a8af6023c126..14b93c5564e3 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -73,6 +73,11 @@ static inline void iommu_batch_start(struct device *dev, unsigned long prot, uns p->npages = 0; } +static inline bool iommu_use_atu(struct iommu *iommu, u64 mask) +{ + return iommu->atu && mask > DMA_BIT_MASK(32); +} + /* Interrupts must be disabled. */ static long iommu_batch_flush(struct iommu_batch *p, u64 mask) { @@ -92,7 +97,7 @@ static long iommu_batch_flush(struct iommu_batch *p, u64 mask) prot &= (HV_PCI_MAP_ATTR_READ | HV_PCI_MAP_ATTR_WRITE); while (npages != 0) { - if (mask <= DMA_BIT_MASK(32) || !pbm->iommu->atu) { + if (!iommu_use_atu(pbm->iommu, mask)) { num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry), npages, @@ -179,7 +184,6 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, unsigned long flags, order, first_page, npages, n; unsigned long prot = 0; struct iommu *iommu; - struct atu *atu; struct iommu_map_table *tbl; struct page *page; void *ret; @@ -205,13 +209,11 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, memset((char *)first_page, 0, PAGE_SIZE << order); iommu = dev->archdata.iommu; - atu = iommu->atu; - mask = dev->coherent_dma_mask; - if (mask <= DMA_BIT_MASK(32) || !atu) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else - tbl = &atu->tbl; + tbl = &iommu->atu->tbl; entry = iommu_tbl_range_alloc(dev, tbl, npages, NULL, (unsigned long)(-1), 0); @@ -333,7 +335,7 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu, atu = iommu->atu; devhandle = pbm->devhandle; - if (dvma <= DMA_BIT_MASK(32)) { + if (!iommu_use_atu(iommu, dvma)) { tbl = &iommu->tbl; iotsb_num = 0; /* we don't care for legacy iommu */ } else { @@ -374,7 +376,7 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page, npages >>= IO_PAGE_SHIFT; mask = *dev->dma_mask; - if (mask <= DMA_BIT_MASK(32)) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else tbl = &atu->tbl; @@ -510,7 +512,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, IO_PAGE_SIZE) >> IO_PAGE_SHIFT; mask = *dev->dma_mask; - if (mask <= DMA_BIT_MASK(32)) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else tbl = &atu->tbl; diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b9a5a04b2d2c..a1dd24307b00 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -469,3 +469,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index dce6db147f24..70ee60383900 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,162 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <asm/percpu.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - -/* struct mmu_gather is an opaque type used by the mm code for passing around - * any data needed by arch specific code for tlb_remove_page. - */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int need_flush; /* Really unmapped some ptes? */ - unsigned long start; - unsigned long end; - unsigned int fullmm; /* non-zero means full mm flush */ -}; - -static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, - unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->need_flush = 0; - - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - - init_tlb_gather(tlb); -} - -extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end); - -static inline void -tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end); -} - -static inline void -tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - init_tlb_gather(tlb); -} - -static inline void -tlb_flush_mmu(struct mmu_gather *tlb) -{ - if (!tlb->need_flush) - return; - - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -/* arch_tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->start = start; - tlb->end = end; - tlb->need_flush = 1; - } - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -/* tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), - * while handling the additional races in SMP caused by other CPUs - * caching valid mappings in their TLBs. - */ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -/** - * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. - * - * Record the fact that pte's were really umapped in ->need_flush, so we can - * later optimise away the tlb invalidate. This helps when userspace is - * unmapping already-unmapped pages, which happens quite a lot. - */ -#define tlb_remove_tlb_entry(tlb, ptep, address) \ - do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, address); \ - } while (0) - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) - -#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) - -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) - -#define tlb_migrate_finish(mm) do {} while (0) +#include <asm-generic/cacheflush.h> +#include <asm-generic/tlb.h> #endif diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c index ebe7bcf62684..bd95e020d509 100644 --- a/arch/um/kernel/stacktrace.c +++ b/arch/um/kernel/stacktrace.c @@ -63,8 +63,6 @@ static const struct stacktrace_ops dump_ops = { static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, &dump_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 0d5869c9bf03..2445dfcf6444 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -20,6 +20,7 @@ config UNICORE32 select GENERIC_IOMAP select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE + select MMU_GATHER_NO_RANGE if MMU help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 9cca15cdae94..00a8477333f6 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -12,10 +12,9 @@ #ifndef __UNICORE_TLB_H__ #define __UNICORE_TLB_H__ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) +/* + * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm(). + */ #define __pte_free_tlb(tlb, pte, addr) \ do { \ diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c index 9976e767d51c..e37da8c6837b 100644 --- a/arch/unicore32/kernel/stacktrace.c +++ b/arch/unicore32/kernel/stacktrace.c @@ -120,8 +120,6 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7d160f58a8f6..90e2640ade75 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -74,6 +74,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_STACKWALK select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 @@ -183,7 +184,6 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if PARAVIRT - select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API @@ -1488,7 +1488,7 @@ config X86_CPA_STATISTICS depends on DEBUG_FS ---help--- Expose statistics about the Change Page Attribute mechanims, which - helps to determine the effectivness of preserving large and huge + helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. config ARCH_HAS_MEM_ENCRYPT diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index c0d6c560df69..5a237e8dbf8d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -352,7 +352,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, boot_params->hdr.loadflags &= ~KASLR_FLAG; /* Save RSDP address for later use. */ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + /* boot_params->acpi_rsdp_addr = get_rsdp_addr(); */ sanitize_boot_params(boot_params); diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 3b6e70d085da..8457cdd47f75 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2) vpaddq t2,t1,t1 vmovq t1x,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index e6add74d78a5..6f0be7a86964 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx @@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2) paddq t2,t1 movq t1,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d309f30cf7af..5fc76b755510 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -650,6 +650,7 @@ ENTRY(__switch_to_asm) pushl %ebx pushl %edi pushl %esi + pushfl /* switch stack */ movl %esp, TASK_threadsp(%eax) @@ -672,6 +673,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfl popl %esi popl %edi popl %ebx diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 007b3fe9d727..98c7d12b945c 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -29,12 +29,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page +extern u8 pvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif #ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page +extern u8 hvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 7d2d7c801dba..f15441b07dad 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -3,10 +3,14 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/delay.h> #include <asm/apicdef.h> +#include <asm/nmi.h> #include "../perf_event.h" +static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -112,23 +116,144 @@ static __initconst const u64 amd_hw_cache_event_ids }, }; +static __initconst const u64 amd_hw_cache_event_ids_f17h + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */ + [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */ + [C(RESULT_MISS)] = 0, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */ + [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */ + [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */ + [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */ + [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + /* - * AMD Performance Monitor K7 and later. + * AMD Performance Monitor K7 and later, up to and including Family 16h: */ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, - [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; static u64 amd_pmu_event_map(int hw_event) { + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + return amd_perfmon_event_map[hw_event]; } @@ -429,6 +554,132 @@ static void amd_pmu_cpu_dead(int cpu) } } +/* + * When a PMC counter overflows, an NMI is used to process the event and + * reset the counter. NMI latency can result in the counter being updated + * before the NMI can run, which can result in what appear to be spurious + * NMIs. This function is intended to wait for the NMI to run and reset + * the counter to avoid possible unhandled NMI messages. + */ +#define OVERFLOW_WAIT_COUNT 50 + +static void amd_pmu_wait_on_overflow(int idx) +{ + unsigned int i; + u64 counter; + + /* + * Wait for the counter to be reset if it has overflowed. This loop + * should exit very, very quickly, but just in case, don't wait + * forever... + */ + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { + rdmsrl(x86_pmu_event_addr(idx), counter); + if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + break; + + /* Might be in IRQ context, so can't sleep */ + udelay(1); + } +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + x86_pmu_disable_all(); + + /* + * This shouldn't be called from NMI context, but add a safeguard here + * to return, since if we're in NMI context we can't wait for an NMI + * to reset an overflowed counter value. + */ + if (in_nmi()) + return; + + /* + * Check each counter for overflow and wait for it to be reset by the + * NMI if it has overflowed. This relies on the fact that all active + * counters are always enabled when this function is caled and + * ARCH_PERFMON_EVENTSEL_INT is always set. + */ + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_wait_on_overflow(idx); + } +} + +static void amd_pmu_disable_event(struct perf_event *event) +{ + x86_pmu_disable_event(event); + + /* + * This can be called from NMI context (via x86_pmu_stop). The counter + * may have overflowed, but either way, we'll never see it get reset + * by the NMI if we're already in the NMI. And the NMI latency support + * below will take care of any pending NMI that might have been + * generated by the overflow. + */ + if (in_nmi()) + return; + + amd_pmu_wait_on_overflow(event->hw.idx); +} + +/* + * Because of NMI latency, if multiple PMC counters are active or other sources + * of NMIs are received, the perf NMI handler can handle one or more overflowed + * PMC counters outside of the NMI associated with the PMC overflow. If the NMI + * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel + * back-to-back NMI support won't be active. This PMC handler needs to take into + * account that this can occur, otherwise this could result in unknown NMI + * messages being issued. Examples of this is PMC overflow while in the NMI + * handler when multiple PMCs are active or PMC overflow while handling some + * other source of an NMI. + * + * Attempt to mitigate this by using the number of active PMCs to determine + * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset + * any PMCs. The per-CPU perf_nmi_counter variable is set to a minimum of the + * number of active PMCs or 2. The value of 2 is used in case an NMI does not + * arrive at the LAPIC in time to be collapsed into an already pending NMI. + */ +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int active, handled; + + /* + * Obtain the active count before calling x86_pmu_handle_irq() since + * it is possible that x86_pmu_handle_irq() may make a counter + * inactive (through x86_pmu_stop). + */ + active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + /* + * If a counter was handled, record the number of possible remaining + * NMIs that can occur. + */ + if (handled) { + this_cpu_write(perf_nmi_counter, + min_t(unsigned int, 2, active)); + + return handled; + } + + if (!this_cpu_read(perf_nmi_counter)) + return NMI_DONE; + + this_cpu_dec(perf_nmi_counter); + + return NMI_HANDLED; +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -621,11 +872,11 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, .enable_all = x86_pmu_enable_all, .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, + .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, @@ -718,9 +969,10 @@ __init int amd_pmu_init(void) x86_pmu.amd_nb_constraints = 0; } - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (boot_cpu_data.x86 >= 0x17) + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids)); + else + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); return 0; } @@ -732,7 +984,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -750,7 +1002,7 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e2b1447192a8..81911e11a15d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1349,8 +1349,9 @@ void x86_pmu_stop(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + if (test_bit(hwc->idx, cpuc->active_mask)) { x86_pmu.disable(event); + __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; @@ -1447,16 +1448,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs) apic_write(APIC_LVTPC, APIC_DM_NMI); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; + if (!test_bit(idx, cpuc->active_mask)) continue; - } event = cpuc->events[idx]; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8baa441d8000..d35f4775d5f1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2091,15 +2091,19 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; } x86_pmu_disable_event(event); + + /* + * Needs to be called after x86_pmu_disable_event, + * so we don't trigger the event without PEBS bit set. + */ + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); } static void intel_pmu_del_event(struct perf_event *event) @@ -3131,7 +3135,7 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) flags &= ~PERF_SAMPLE_TIME; if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; - if (event->attr.sample_regs_user & ~PEBS_REGS) + if (event->attr.sample_regs_user & ~PEBS_GP_REGS) flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } @@ -3185,7 +3189,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { - if (!event->attr.freq) { + if (!(event->attr.freq || event->attr.wakeup_events)) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event))) @@ -3575,6 +3579,12 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + if (x86_pmu.flags & PMU_FL_TFA) { + WARN_ON_ONCE(cpuc->tfa_shadow); + cpuc->tfa_shadow = ~0ULL; + intel_set_tfa(cpuc, false); + } + if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 94a4b7fc75d0..d41de9af7a39 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -76,15 +76,15 @@ * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL * Scope: Package (physical package) * */ @@ -566,8 +566,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fb3a2f13fc70..339d7628080c 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1525,8 +1525,7 @@ static __init int pt_init(void) } if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) - pt_pmu.pmu.capabilities = - PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; pt_pmu.pmu.attr_groups = pt_attr_groups; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a75955741c50..1e98a42b560a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -96,25 +96,25 @@ struct amd_nb { PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ PERF_SAMPLE_PERIOD) -#define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ - PERF_REG_X86_CX | \ - PERF_REG_X86_DX | \ - PERF_REG_X86_DI | \ - PERF_REG_X86_SI | \ - PERF_REG_X86_SP | \ - PERF_REG_X86_BP | \ - PERF_REG_X86_IP | \ - PERF_REG_X86_FLAGS | \ - PERF_REG_X86_R8 | \ - PERF_REG_X86_R9 | \ - PERF_REG_X86_R10 | \ - PERF_REG_X86_R11 | \ - PERF_REG_X86_R12 | \ - PERF_REG_X86_R13 | \ - PERF_REG_X86_R14 | \ - PERF_REG_X86_R15) +#define PEBS_GP_REGS \ + ((1ULL << PERF_REG_X86_AX) | \ + (1ULL << PERF_REG_X86_BX) | \ + (1ULL << PERF_REG_X86_CX) | \ + (1ULL << PERF_REG_X86_DX) | \ + (1ULL << PERF_REG_X86_DI) | \ + (1ULL << PERF_REG_X86_SI) | \ + (1ULL << PERF_REG_X86_SP) | \ + (1ULL << PERF_REG_X86_BP) | \ + (1ULL << PERF_REG_X86_IP) | \ + (1ULL << PERF_REG_X86_FLAGS) | \ + (1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) /* * Per register state. diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 321fe5f5d0e9..4d5fcd47ab75 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -61,9 +61,8 @@ } while (0) #define RELOAD_SEG(seg) { \ - unsigned int pre = GET_SEG(seg); \ + unsigned int pre = (seg) | 3; \ unsigned int cur = get_user_seg(seg); \ - pre |= 3; \ if (pre != cur) \ set_user_seg(seg, pre); \ } @@ -72,6 +71,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; + u16 gs, fs, es, ds; void __user *buf; u32 tmp; @@ -79,16 +79,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; get_user_try { - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); + gs = GET_SEG(gs); + fs = GET_SEG(fs); + ds = GET_SEG(ds); + es = GET_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(ax); @@ -106,6 +100,17 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + RELOAD_SEG(gs); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); + err |= fpu__restore_sig(buf, 1); force_iret(); diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 31b627b43a8e..464034db299f 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -20,6 +20,17 @@ #endif /* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - . + .popsection +.endm + +/* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains * enough information for the alternatives patching code to patch an diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4c74073a19cc..094fbc9c0b1c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,6 +45,16 @@ #define LOCK_PREFIX "" #endif +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.ignore_alts\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 6467757bb39f..3ff577c0b102 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -148,30 +148,6 @@ _ASM_PTR (entry); \ .popsection -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE_UA(100b, 103b) - _ASM_EXTABLE_UA(101b, 103b) - .endm - #else # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index d153d570bb04..8e790ec219a5 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -36,16 +36,17 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ -#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#define RLONG_ADDR(x) "m" (*(volatile long *) (x)) +#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x)) -#define ADDR BITOP_ADDR(addr) +#define ADDR RLONG_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. */ #define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) -#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) /** @@ -73,7 +74,7 @@ set_bit(long nr, volatile unsigned long *addr) : "memory"); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" - : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -88,7 +89,7 @@ set_bit(long nr, volatile unsigned long *addr) */ static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory"); + asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } /** @@ -110,8 +111,7 @@ clear_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" - : BITOP_ADDR(addr) - : "Ir" (nr)); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -131,7 +131,7 @@ static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *ad static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) @@ -139,7 +139,7 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile bool negative; asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) - : CC_OUT(s) (negative), ADDR + : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "ir" ((char) ~(1 << nr)) : "memory"); return negative; } @@ -155,13 +155,9 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile * __clear_bit() is non-atomic and implies release semantics before the memory * operation. It can be used for an unlock if no other CPUs can concurrently * modify other bits in the word. - * - * No memory barrier is required here, because x86 cannot reorder stores past - * older loads. Same principle as spin_unlock. */ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { - barrier(); __clear_bit(nr, addr); } @@ -176,7 +172,7 @@ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long * */ static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } /** @@ -196,8 +192,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" - : BITOP_ADDR(addr) - : "Ir" (nr)); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -242,8 +237,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -282,8 +277,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -294,8 +289,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr) : "memory"); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -326,7 +321,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long *)addr), "Ir" (nr)); + : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 93c4bf598fb0..feab24cac610 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -226,7 +226,9 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); - int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, + const char *smstate); + void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 159b5988292f..c79abe7ca093 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) } #define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 @@ -295,6 +295,7 @@ union kvm_mmu_extended_role { unsigned int valid:1; unsigned int execonly:1; unsigned int cr0_pg:1; + unsigned int cr4_pae:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; @@ -844,9 +845,9 @@ enum kvm_irqchip_mode { }; struct kvm_arch { - unsigned int n_used_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_max_mmu_pages; + unsigned long n_used_mmu_pages; + unsigned long n_requested_mmu_pages; + unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -1182,7 +1183,7 @@ struct kvm_x86_ops { int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); int (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); @@ -1256,8 +1257,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); bool pdptrs_changed(struct kvm_vcpu *vcpu); @@ -1592,4 +1593,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val +#define GET_SMSTATE(type, buf, offset) \ + (*(type *)((buf) + (offset) - 0x7e00)) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dad12b767ba0..daf25b60c9e3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,6 +11,15 @@ #include <asm/msr-index.h> /* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + ANNOTATE_IGNORE_ALTERNATIVE + +/* * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an @@ -57,19 +66,6 @@ #ifdef __ASSEMBLY__ /* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -.macro ANNOTATE_NOSPEC_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.nospec - .long .Lannotate_\@ - . - .popsection -.endm - -/* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline * builds. @@ -152,12 +148,6 @@ #else /* __ASSEMBLY__ */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - "999:\n\t" \ - ".pushsection .discard.nospec\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" - #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23..50b3e2d963c9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,7 +46,7 @@ void ptdump_walk_user_pgd_level_checkwx(void); */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __visible; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; extern struct list_head pgd_list; diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index db333300bd4b..f94a7d0ddd49 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,13 +13,12 @@ #ifndef _ASM_X86_SMAP_H #define _ASM_X86_SMAP_H -#include <linux/stringify.h> #include <asm/nops.h> #include <asm/cpufeatures.h> /* "Raw" instruction opcodes */ -#define __ASM_CLAC .byte 0x0f,0x01,0xca -#define __ASM_STAC .byte 0x0f,0x01,0xcb +#define __ASM_CLAC ".byte 0x0f,0x01,0xca" +#define __ASM_STAC ".byte 0x0f,0x01,0xcb" #ifdef __ASSEMBLY__ @@ -28,10 +27,10 @@ #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ - ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -49,26 +48,46 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __ASM_CLAC, X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __ASM_STAC, X86_FEATURE_SMAP); +} + +static __always_inline unsigned long smap_save(void) +{ + unsigned long flags; + + asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, + X86_FEATURE_SMAP) + : "=rm" (flags) : : "memory", "cc"); + + return flags; +} + +static __always_inline void smap_restore(unsigned long flags) +{ + asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + : : "g" (flags) : "memory", "cc"); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ static inline void clac(void) { } static inline void stac(void) { } +static inline unsigned long smap_save(void) { return 0; } +static inline void smap_restore(unsigned long flags) { } + #define ASM_CLAC #define ASM_STAC diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7cf1a270d891..18a4b6890fa8 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -46,6 +46,7 @@ struct inactive_task_frame { unsigned long r13; unsigned long r12; #else + unsigned long flags; unsigned long si; unsigned long di; #endif diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 404b8b1d44f5..f23e7aaff4cd 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,7 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +#define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1954dd5552a2..bb21913885a3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -427,10 +427,11 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ - __typeof__(*(ptr)) __pu_val; \ - __pu_val = x; \ + __typeof__(*(ptr)) __pu_val = (x); \ + __typeof__(ptr) __pu_ptr = (ptr); \ + __typeof__(size) __pu_size = (size); \ __uaccess_begin(); \ - __put_user_size(__pu_val, (ptr), (size), __pu_label); \ + __put_user_size(__pu_val, __pu_ptr, __pu_size, __pu_label); \ __pu_err = 0; \ __pu_label: \ __uaccess_end(); \ @@ -705,7 +706,7 @@ extern struct movsl_mask { * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ -static __must_check inline bool user_access_begin(const void __user *ptr, size_t len) +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; @@ -715,6 +716,9 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() +#define user_access_save() smap_save() +#define user_access_restore(x) smap_restore(x) + #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a9d637bc301d..5cd1caa8bc65 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -208,9 +208,6 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) } unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len); - -unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len); #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 2863c2026655..d50c7b747d8b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -217,6 +217,22 @@ xen_single_call(unsigned int call, return (long)__res; } +static __always_inline void __xen_stac(void) +{ + /* + * Suppress objtool seeing the STAC/CLAC and getting confused about it + * calling random code with AC=1. + */ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_STAC ::: "memory", "flags"); +} + +static __always_inline void __xen_clac(void) +{ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_CLAC ::: "memory", "flags"); +} + static inline long privcmd_call(unsigned int call, unsigned long a1, unsigned long a2, @@ -225,9 +241,9 @@ privcmd_call(unsigned int call, { long res; - stac(); + __xen_stac(); res = xen_single_call(call, a1, a2, a3, a4, a5); - clac(); + __xen_clac(); return res; } @@ -424,9 +440,9 @@ HYPERVISOR_dm_op( domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { int ret; - stac(); + __xen_stac(); ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); - clac(); + __xen_clac(); return ret; } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index dabfcf7c3941..7a0e64ccd6ff 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -381,6 +381,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398..d213ec5c3766 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -146,6 +146,7 @@ #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 +#define VMX_ABORT_VMCS_CORRUPTED 3 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2da82eff0eb4..29630393f300 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -275,7 +275,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -419,7 +419,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -440,7 +440,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -658,7 +659,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ @@ -672,7 +673,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -1008,6 +1010,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..3142fd7a9b32 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -611,8 +611,8 @@ static void init_intel_energy_perf(struct cpuinfo_x86 *c) if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) return; - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); + pr_info_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + pr_info_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 399601eda8e4..85212a32b54d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2039,14 +2039,14 @@ out: enum rdt_param { Opt_cdp, Opt_cdpl2, - Opt_mba_mpbs, + Opt_mba_mbps, nr__rdt_params }; static const struct fs_parameter_spec rdt_param_specs[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_mpbs", Opt_mba_mpbs), + fsparam_flag("mba_MBps", Opt_mba_mbps), {} }; @@ -2072,7 +2072,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_cdpl2: ctx->enable_cdpl2 = true; return 0; - case Opt_mba_mpbs: + case Opt_mba_mbps: if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -EINVAL; ctx->enable_mba_mbps = true; @@ -2610,9 +2610,10 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } - rdtgrp->mode = RDT_MODE_SHAREABLE; } + rdtgrp->mode = RDT_MODE_SHAREABLE; + return 0; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a034cb808e7e..fed46ddb1eef 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -569,6 +569,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -748,26 +749,48 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +static struct kprobe kretprobe_kprobe = { + .addr = (void *)kretprobe_trampoline, +}; + /* * Called from kretprobe_trampoline */ static __used void *trampoline_handler(struct pt_regs *regs) { + struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; + + preempt_disable(); + + /* + * Set a dummy kprobe for avoiding kretprobe recursion. + * Since kretprobe never run in kprobe handler, kprobe must not + * be running at this point. + */ + kcb = get_kprobe_ctlblk(); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = ®s->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -789,8 +812,25 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be poped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -808,14 +848,15 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); } recycle_rp_inst(ri, &empty_rp); @@ -831,6 +872,9 @@ static __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..957eae13b370 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -426,6 +426,8 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; + lockdep_assert_irqs_disabled(); + /* * If TIF_SSBD is different, select the proper mitigation * method. Note that if SSBD mitigation is disabled or permanentely @@ -477,10 +479,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..70933193878c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..844a28b29967 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c0..8fd3cedd9acc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd9..3773905cd2c1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1005,13 +1005,11 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_init(); - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine(), for the boot CPU. + * needs to be done after dmi_setup(), for the boot CPU. */ init_hypervisor_platform(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..dff90fb6a9af 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -132,16 +132,6 @@ static int restore_sigcontext(struct pt_regs *regs, COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && - user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ @@ -150,6 +140,15 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -461,6 +460,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *fp = NULL; + unsigned long uc_flags; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); @@ -473,9 +473,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -541,6 +543,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, { #ifdef CONFIG_X86_X32_ABI struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -555,9 +558,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -688,10 +693,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; - /* - * Increment event counter and perform fixup for the pre-signal - * frame. - */ + /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 5c2d71a1dc06..2abf27d7df6b 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -12,78 +12,31 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -static int save_stack_address(struct stack_trace *trace, unsigned long addr, - bool nosched) -{ - if (nosched && in_sched_functions(addr)) - return 0; - - if (trace->skip > 0) { - trace->skip--; - return 0; - } - - if (trace->nr_entries >= trace->max_entries) - return -1; - - trace->entries[trace->nr_entries++] = addr; - return 0; -} - -static void noinline __save_stack_trace(struct stack_trace *trace, - struct task_struct *task, struct pt_regs *regs, - bool nosched) +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; - if (regs) - save_stack_address(trace, regs->ip, nosched); + if (regs && !consume_entry(cookie, regs->ip, false)) + return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || save_stack_address(trace, addr, nosched)) + if (!addr || !consume_entry(cookie, addr, false)) break; } - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } /* - * Save stack-backtrace addresses into a stack_trace buffer. + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. */ -void save_stack_trace(struct stack_trace *trace) -{ - trace->skip++; - __save_stack_trace(trace, current, NULL, false); -} -EXPORT_SYMBOL_GPL(save_stack_trace); - -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - __save_stack_trace(trace, current, regs, false); -} - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) - trace->skip++; - __save_stack_trace(trace, tsk, NULL, true); - - put_task_stack(tsk); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE - -static int __always_inline -__save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; @@ -97,7 +50,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (regs) { /* Success path for user tasks */ if (user_mode(regs)) - goto success; + return 0; /* * Kernel mode registers on the stack indicate an @@ -120,7 +73,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!addr) return -EINVAL; - if (save_stack_address(trace, addr, false)) + if (!consume_entry(cookie, addr, false)) return -EINVAL; } @@ -132,39 +85,9 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; -success: - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; - return 0; } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ -int save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - int ret; - - /* - * If the task doesn't have a stack (e.g., a zombie), the stack is - * "reliably" empty. - */ - if (!try_get_task_stack(tsk)) - return 0; - - ret = __save_stack_trace_reliable(trace, tsk); - - put_task_stack(tsk); - - return ret; -} -#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ - /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -189,15 +112,15 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) return ret; } -static inline void __save_stack_trace_user(struct stack_trace *trace) +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) { - const struct pt_regs *regs = task_pt_regs(current); const void __user *fp = (const void __user *)regs->bp; - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; + if (!consume_entry(cookie, regs->ip, false)) + return; - while (trace->nr_entries < trace->max_entries) { + while (1) { struct stack_frame_user frame; frame.next_fp = NULL; @@ -207,8 +130,8 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { - trace->entries[trace->nr_entries++] = - frame.ret_addr; + if (!consume_entry(cookie, frame.ret_addr, false)) + return; } if (fp == frame.next_fp) break; @@ -216,14 +139,3 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) } } -void save_stack_trace_user(struct stack_trace *trace) -{ - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm) { - __save_stack_trace_user(trace); - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; -} diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bad8c51fee6e..a5127b2c195f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -362,7 +362,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) - *(.bss) + *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c338984c850d..d0d5dd44b4f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2331,24 +2331,18 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) { +#ifdef CONFIG_X86_64 u32 eax, ebx, ecx, edx; eax = 0x80000001; ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); return edx & bit(X86_FEATURE_LM); +#else + return false; +#endif } -#define GET_SMSTATE(type, smbase, offset) \ - ({ \ - type __val; \ - int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ - sizeof(__val)); \ - if (r != X86EMUL_CONTINUE) \ - return X86EMUL_UNHANDLEABLE; \ - __val; \ - }) - static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) { desc->g = (flags >> 23) & 1; @@ -2361,27 +2355,30 @@ static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) desc->type = (flags >> 8) & 15; } -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) { struct desc_struct desc; int offset; u16 selector; - selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); return X86EMUL_CONTINUE; } -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +#ifdef CONFIG_X86_64 +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) { struct desc_struct desc; int offset; @@ -2390,15 +2387,16 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) offset = 0x7e00 + n * 16; - selector = GET_SMSTATE(u16, smbase, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - base3 = GET_SMSTATE(u32, smbase, offset + 12); + selector = GET_SMSTATE(u16, smstate, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + base3 = GET_SMSTATE(u32, smstate, offset + 12); ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); return X86EMUL_CONTINUE; } +#endif static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, u64 cr0, u64 cr3, u64 cr4) @@ -2445,7 +2443,8 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, + const char *smstate) { struct desc_struct desc; struct desc_ptr dt; @@ -2453,53 +2452,55 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) u32 val, cr0, cr3, cr4; int i; - cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); + cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - val = GET_SMSTATE(u32, smbase, 0x7fcc); + val = GET_SMSTATE(u32, smstate, 0x7fcc); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7fc8); + val = GET_SMSTATE(u32, smstate, 0x7fc8); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - selector = GET_SMSTATE(u32, smbase, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + selector = GET_SMSTATE(u32, smstate, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - selector = GET_SMSTATE(u32, smbase, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + selector = GET_SMSTATE(u32, smstate, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - dt.address = GET_SMSTATE(u32, smbase, 0x7f74); - dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + dt.address = GET_SMSTATE(u32, smstate, 0x7f74); + dt.size = GET_SMSTATE(u32, smstate, 0x7f70); ctxt->ops->set_gdt(ctxt, &dt); - dt.address = GET_SMSTATE(u32, smbase, 0x7f58); - dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + dt.address = GET_SMSTATE(u32, smstate, 0x7f58); + dt.size = GET_SMSTATE(u32, smstate, 0x7f54); ctxt->ops->set_idt(ctxt, &dt); for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smbase, i); + int r = rsm_load_seg_32(ctxt, smstate, i); if (r != X86EMUL_CONTINUE) return r; } - cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } -static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +#ifdef CONFIG_X86_64 +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, + const char *smstate) { struct desc_struct desc; struct desc_ptr dt; @@ -2509,43 +2510,43 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) int i, r; for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smbase, 0x7f68); + val = GET_SMSTATE(u32, smstate, 0x7f68); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7f60); + val = GET_SMSTATE(u32, smstate, 0x7f60); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - cr3 = GET_SMSTATE(u64, smbase, 0x7f50); - cr4 = GET_SMSTATE(u64, smbase, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); - val = GET_SMSTATE(u64, smbase, 0x7ed0); + cr0 = GET_SMSTATE(u64, smstate, 0x7f58); + cr3 = GET_SMSTATE(u64, smstate, 0x7f50); + cr4 = GET_SMSTATE(u64, smstate, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); + val = GET_SMSTATE(u64, smstate, 0x7ed0); ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); - selector = GET_SMSTATE(u32, smbase, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); - base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + selector = GET_SMSTATE(u32, smstate, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); + base3 = GET_SMSTATE(u32, smstate, 0x7e9c); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e84); - dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + dt.size = GET_SMSTATE(u32, smstate, 0x7e84); + dt.address = GET_SMSTATE(u64, smstate, 0x7e88); ctxt->ops->set_idt(ctxt, &dt); - selector = GET_SMSTATE(u32, smbase, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); - base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + selector = GET_SMSTATE(u32, smstate, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); + base3 = GET_SMSTATE(u32, smstate, 0x7e7c); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e64); - dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + dt.size = GET_SMSTATE(u32, smstate, 0x7e64); + dt.address = GET_SMSTATE(u64, smstate, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); @@ -2553,37 +2554,49 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) return r; for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smbase, i); + r = rsm_load_seg_64(ctxt, smstate, i); if (r != X86EMUL_CONTINUE) return r; } return X86EMUL_CONTINUE; } +#endif static int em_rsm(struct x86_emulate_ctxt *ctxt) { unsigned long cr0, cr4, efer; + char buf[512]; u64 smbase; int ret; if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); + smbase = ctxt->ops->get_smbase(ctxt); + + ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); + if (ret != X86EMUL_CONTINUE) + return X86EMUL_UNHANDLEABLE; + + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); + /* * Get back to real mode, to prepare a safe state in which to load * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU * supports long mode. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); if (emulator_has_longmode(ctxt)) { struct desc_struct cs_desc; /* Zero CR4.PCIDE before CR0.PG. */ - if (cr4 & X86_CR4_PCIDE) { + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - cr4 &= ~X86_CR4_PCIDE; - } /* A 32-bit code segment is required to clear EFER.LMA. */ memset(&cs_desc, 0, sizeof(cs_desc)); @@ -2597,39 +2610,39 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + if (emulator_has_longmode(ctxt)) { + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - smbase = ctxt->ops->get_smbase(ctxt); + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + } /* * Give pre_leave_smm() a chance to make ISA-specific changes to the * vCPU state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (ctxt->ops->pre_leave_smm(ctxt, smbase)) + if (ctxt->ops->pre_leave_smm(ctxt, buf)) return X86EMUL_UNHANDLEABLE; +#ifdef CONFIG_X86_64 if (emulator_has_longmode(ctxt)) - ret = rsm_load_state_64(ctxt, smbase + 0x8000); + ret = rsm_load_state_64(ctxt, buf); else - ret = rsm_load_state_32(ctxt, smbase + 0x8000); +#endif + ret = rsm_load_state_32(ctxt, buf); if (ret != X86EMUL_CONTINUE) { /* FIXME: should triple fault */ return X86EMUL_UNHANDLEABLE; } - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); + ctxt->ops->post_leave_smm(ctxt); - ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & - ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 421899f6ad7b..cc24b3a32c44 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; - all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; + + /* + * Work around possible WS2012 bug: it sends hypercalls + * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, + * while also expecting us to flush something and crashing if + * we don't. Let's treat processor_mask == 0 same as + * HV_FLUSH_ALL_PROCESSORS. + */ + all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || + flush.processor_mask == 0; } else { if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, sizeof(flush_ex)))) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 991fdf7fc17f..bd13fdddbdc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,7 +70,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -static bool lapic_timer_advance_adjust_done = false; #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -138,6 +137,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); + offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { @@ -901,7 +901,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { - *dst = &map->phys_map[irq->dest_id]; + u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); + *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; @@ -1480,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } +static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) +{ + u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; + + /* + * If the guest TSC is running at a different ratio than the host, then + * convert the delay to nanoseconds to achieve an accurate delay. Note + * that __delay() uses delay_tsc whenever the hardware has TSC, thus + * always for VMX enabled hardware. + */ + if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + __delay(min(guest_cycles, + nsec_to_cycles(vcpu, timer_advance_ns))); + } else { + u64 delay_ns = guest_cycles * 1000000ULL; + do_div(delay_ns, vcpu->arch.virtual_tsc_khz); + ndelay(min_t(u32, delay_ns, timer_advance_ns)); + } +} + void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 guest_tsc, tsc_deadline, ns; - if (!lapic_in_kernel(vcpu)) - return; - if (apic->lapic_timer.expired_tscdeadline == 0) return; @@ -1499,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (!lapic_timer_advance_adjust_done) { + if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ if (guest_tsc < tsc_deadline) { ns = (tsc_deadline - guest_tsc) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns -= min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } else { /* too late */ ns = (guest_tsc - tsc_deadline) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns += min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - lapic_timer_advance_adjust_done = true; + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = 0; + apic->lapic_timer.timer_advance_adjust_done = true; + } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; } } static void start_sw_tscdeadline(struct kvm_lapic *apic) { - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; @@ -1540,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); + + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + + if (likely(tscdeadline > guest_tsc) && + likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); + expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -2253,7 +2278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu) +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) { struct kvm_lapic *apic; @@ -2277,6 +2302,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; + if (timer_advance_ns == -1) { + apic->lapic_timer.timer_advance_ns = 1000; + apic->lapic_timer.timer_advance_adjust_done = false; + } else { + apic->lapic_timer.timer_advance_ns = timer_advance_ns; + apic->lapic_timer.timer_advance_adjust_done = true; + } + /* * APIC is created enabled. This will prevent kvm_lapic_set_base from diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff6ef9c3d760..d6d049ba3045 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,8 +31,10 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; u64 expired_tscdeadline; + u32 timer_advance_ns; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; + bool timer_advance_adjust_done; }; struct kvm_lapic { @@ -62,7 +64,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eee455a8a612..d9c7b45d231f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt) * aggregate version in order to make the slab shrinker * faster */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); @@ -2238,7 +2238,7 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, struct list_head *invalid_list, bool remote_flush) { - if (!remote_flush && !list_empty(invalid_list)) + if (!remote_flush && list_empty(invalid_list)) return false; if (!list_empty(invalid_list)) @@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { LIST_HEAD(invalid_list); @@ -4781,6 +4781,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) union kvm_mmu_extended_role ext = {0}; ext.cr0_pg = !!is_paging(vcpu); + ext.cr4_pae = !!is_pae(vcpu); ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); ext.cr4_pse = !!is_pse(vcpu); @@ -6031,10 +6032,10 @@ out: /* * Calculate mmu pages needed for kvm. */ -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) { - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; + unsigned long nr_mmu_pages; + unsigned long nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; @@ -6047,8 +6048,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bbdc60f2fae8..54c2a377795b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); -static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) return kvm->arch.n_max_mmu_pages - diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 58ead7db71a3..e39741997893 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -281,9 +281,13 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { bool fast_mode = idx & (1u << 31); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; u64 ctr_val; + if (!pmu->version) + return 1; + if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0a791c3d4fc..406b558abfef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -262,6 +262,7 @@ struct amd_svm_iommu_ir { }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) @@ -2692,6 +2693,7 @@ static int npf_interception(struct vcpu_svm *svm) static int db_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_vcpu *vcpu = &svm->vcpu; if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && @@ -2702,6 +2704,8 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { disable_nmi_singlestep(svm); + /* Make sure we check for pending NMIs upon entry */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } if (svm->vcpu.guest_debug & @@ -4517,14 +4521,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; struct kvm_lapic *apic = svm->vcpu.arch.apic; /* - * Update ICR high and low, then emulate sending IPI, - * which is handled when writing APIC_ICR. + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } break; } case AVIC_IPI_FAILURE_INVALID_TARGET: @@ -4596,7 +4611,7 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) - WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); + clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) @@ -5621,6 +5636,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); + kvm_load_guest_xcr0(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -5766,6 +5782,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); + kvm_put_guest_xcr0(vcpu); stgi(); /* Any pending NMI will happen here */ @@ -6215,32 +6232,24 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *nested_vmcb; struct page *page; - struct { - u64 guest; - u64 vmcb; - } svm_state_save; - int ret; + u64 guest; + u64 vmcb; - ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save, - sizeof(svm_state_save)); - if (ret) - return ret; + guest = GET_SMSTATE(u64, smstate, 0x7ed8); + vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); - if (svm_state_save.guest) { - vcpu->arch.hflags &= ~HF_SMM_MASK; - nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page); - if (nested_vmcb) - enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page); - else - ret = 1; - vcpu->arch.hflags |= HF_SMM_MASK; + if (guest) { + nested_vmcb = nested_svm_map(svm, vmcb, &page); + if (!nested_vmcb) + return 1; + enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); } - return ret; + return 0; } static int enable_smi_window(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6432d08c7de7..4d47a2631d1f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -438,13 +438,13 @@ TRACE_EVENT(kvm_apic_ipi, ); TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), + TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) - __field( __u8, tm ) + __field( __u16, tm ) __field( __u8, vec ) ), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7ec9bb1dd723..0c601d079cd2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2873,20 +2873,27 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - * Failing the vm entry is _not_ what the processor - * does but it's basically the only possibility we - * have. We could still enter the guest if CR8 load - * exits are enabled, CR8 store exits are enabled, and - * virtualize APIC access is disabled; in this case - * the processor would never use the TPR shadow and we - * could simply clear the bit from the execution - * control. But such a configuration is useless, so - * let's keep the code simple. */ if (!is_error_page(page)) { vmx->nested.virtual_apic_page = page; hpa = page_to_phys(vmx->nested.virtual_apic_page); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && + nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * The processor will never use the TPR shadow, simply + * clear the bit from the execution control. Such a + * configuration is useless, but it happens in tests. + * For any other configuration, failing the vm entry is + * _not_ what the processor does but it's basically the + * only possibility we have. + */ + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_TPR_SHADOW); + } else { + printk("bad virtual-APIC page address\n"); + dump_vmcs(); } } @@ -3789,8 +3796,18 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 + * points to shadow pages! Fortunately we only get here after a WARN_ON + * if EPT is disabled, so a VMabort is perfectly fine. + */ + if (enable_ept) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } else { + nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); + } /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -5406,7 +5423,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return ret; /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) return 0; if (kvm_state->vmx.vmcs_pa != -1ull) { @@ -5450,7 +5467,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer != -1ull) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12)) return -EINVAL; if (copy_from_user(shadow_vmcs12, @@ -5738,6 +5755,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + /* + * Without EPT it is not possible to restore L1's CR3 and PDPTR on + * VMfail, because they are not available in vmcs01. Just always + * use hardware checks. + */ + if (!enable_ept) + nested_early_check = 1; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7b272738c576..d4cb1945b2e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> +#include <asm/nospec-branch.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter) * referred to by VMCS.HOST_RIP. */ ENTRY(vmx_vmexit) +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE + /* Preserve guest's RAX, it's used to stuff the RSB. */ + push %_ASM_AX + + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + + pop %_ASM_AX +.Lvmexit_skip_rsb: +#endif ret ENDPROC(vmx_vmexit) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..0c955bb286ff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5603,7 +5603,7 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +void dump_vmcs(void) { u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); @@ -6410,6 +6410,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + kvm_load_guest_xcr0(vcpu); + if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && vcpu->arch.pkru != vmx->host_pkru) @@ -6460,9 +6462,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) current_evmcs->hv_clean_fields |= @@ -6506,6 +6505,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } + kvm_put_guest_xcr0(vcpu); + vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -6852,6 +6853,30 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } +static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return false; + + eax.full = entry->eax; + return (eax.split.version_id > 0); +} + +static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool pmu_enabled = guest_cpuid_has_pmu(vcpu); + + if (pmu_enabled) + vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING; + else + vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING; +} + static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6940,6 +6965,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); + nested_vmx_procbased_ctls_update(vcpu); } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && @@ -7003,6 +7029,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; if (kvm_mwait_in_guest(vcpu->kvm)) return -EOPNOTSUPP; @@ -7011,7 +7038,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, + ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; @@ -7369,7 +7397,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7380,9 +7408,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) } if (vmx->nested.smm.guest_mode) { - vcpu->arch.hflags &= ~HF_SMM_MASK; ret = nested_vmx_enter_non_root_mode(vcpu, false); - vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1e00d0a2482..f879529906b4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -517,4 +517,6 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +void dump_vmcs(void); + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 099b851dabaf..b5edc8e3ce1d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); -/* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int __read_mostly lapic_timer_advance_ns = 1000; +/* + * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables + * adaptive tuning starting from default advancment of 1000ns. '0' disables + * advancement entirely. Any other value is used as-is and disables adaptive + * tuning, i.e. allows priveleged userspace to set an exact advancement time. + */ +static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); @@ -800,7 +804,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) { if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && !vcpu->guest_xcr0_loaded) { @@ -810,8 +814,9 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->guest_xcr0_loaded = 1; } } +EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) { if (vcpu->guest_xcr0_loaded) { if (vcpu->arch.xcr0 != host_xcr0) @@ -819,6 +824,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->guest_xcr0_loaded = 0; } } +EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -3093,7 +3099,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; default: break; @@ -3528,7 +3534,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags); +static void kvm_smm_changed(struct kvm_vcpu *vcpu); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -3588,12 +3594,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - u32 hflags = vcpu->arch.hflags; - if (events->smi.smm) - hflags |= HF_SMM_MASK; - else - hflags &= ~HF_SMM_MASK; - kvm_set_hflags(vcpu, hflags); + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + kvm_smm_changed(vcpu); + } vcpu->arch.smi_pending = events->smi.pending; @@ -4270,7 +4277,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) + unsigned long kvm_nr_mmu_pages) { if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; @@ -4284,7 +4291,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { return kvm->arch.n_max_mmu_pages; } @@ -5958,12 +5965,18 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) { - kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); + emul_to_vcpu(ctxt)->arch.hflags = emul_flags; } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, + const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); + return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); +} + +static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + kvm_smm_changed(emul_to_vcpu(ctxt)); } static const struct x86_emulate_ops emulate_ops = { @@ -6006,6 +6019,7 @@ static const struct x86_emulate_ops emulate_ops = { .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, + .post_leave_smm = emulator_post_leave_smm, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6247,16 +6261,6 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) -{ - unsigned changed = vcpu->arch.hflags ^ emul_flags; - - vcpu->arch.hflags = emul_flags; - - if (changed & HF_SMM_MASK) - kvm_smm_changed(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -6535,6 +6539,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); +static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; @@ -6551,12 +6561,23 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); + if (ret) + return ret; - if (!ret) { + /* + * Workaround userspace that relies on old KVM behavior of %rip being + * incremented prior to exiting to userspace to handle "OUT 0x7e". + */ + if (port == 0x7e && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) { + vcpu->arch.complete_userspace_io = + complete_fast_pio_out_port_0x7e; + kvm_skip_emulated_instruction(vcpu); + } else { vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } - return ret; + return 0; } static int complete_fast_pio_in(struct kvm_vcpu *vcpu) @@ -7441,9 +7462,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); } +#ifdef CONFIG_X86_64 static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) { -#ifdef CONFIG_X86_64 struct desc_ptr dt; struct kvm_segment seg; unsigned long val; @@ -7493,10 +7514,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) for (i = 0; i < 6; i++) enter_smm_save_seg_64(vcpu, buf, i); -#else - WARN_ON_ONCE(1); -#endif } +#endif static void enter_smm(struct kvm_vcpu *vcpu) { @@ -7507,9 +7526,11 @@ static void enter_smm(struct kvm_vcpu *vcpu) trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); memset(buf, 0, 512); +#ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else +#endif enter_smm_save_state_32(vcpu, buf); /* @@ -7567,8 +7588,10 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); +#ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) kvm_x86_ops->set_efer(vcpu, 0); +#endif kvm_update_cpuid(vcpu); kvm_mmu_reset_context(vcpu); @@ -7865,15 +7888,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - kvm_load_guest_xcr0(vcpu); - if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_x86_ops->request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_timer_advance_ns) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) wait_lapic_expire(vcpu); guest_enter_irqoff(); @@ -7919,8 +7941,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_put_guest_xcr0(vcpu); - kvm_before_interrupt(vcpu); kvm_x86_ops->handle_external_intr(vcpu); kvm_after_interrupt(vcpu); @@ -9063,7 +9083,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - r = kvm_create_lapic(vcpu); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; } else diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 28406aa1136d..534d3f28bb01 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; -extern unsigned int lapic_timer_advance_ns; - extern bool enable_vmware_backdoor; extern struct static_key kvm_no_apic_vcpu; @@ -347,4 +345,6 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } +void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 986652064b15..5246db42de45 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -6,6 +6,18 @@ # Produces uninteresting flaky coverage. KCOV_INSTRUMENT_delay.o := n +# Early boot use of cmdline; don't instrument it +ifdef CONFIG_AMD_MEM_ENCRYPT +KCOV_INSTRUMENT_cmdline.o := n +KASAN_SANITIZE_cmdline.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_cmdline.o = -pg +endif + +CFLAGS_cmdline.o := $(call cc-option, -fno-stack-protector) +endif + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index db4e5aa0858b..b2f1822084ae 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,6 +16,30 @@ #include <asm/smap.h> #include <asm/export.h> +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(100b, 103b) + _ASM_EXTABLE_UA(101b, 103b) + .endm + /* * copy_user_generic_unrolled - memory copy with exception handling. * This version is for CPUs like P4 that don't have efficient micro @@ -194,6 +218,30 @@ ENDPROC(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) /* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ALIGN; +copy_user_handle_tail: + movl %edx,%ecx +1: rep movsb +2: mov %ecx,%eax + ASM_CLAC + ret + + _ASM_EXTABLE_UA(1b, 2b) +ENDPROC(copy_user_handle_tail) + +/* * copy_user_nocache - Uncached memory copy with exception handling * This will force destination out of cache for more performance. * diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 3b24dc05251c..9d05572370ed 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -257,6 +257,7 @@ ENTRY(__memcpy_mcsafe) /* Copy successful. Return zero */ .L_done_memcpy_trap: xorl %eax, %eax +.L_done: ret ENDPROC(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) @@ -273,7 +274,7 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe) addl %edx, %ecx .E_trailing_bytes: mov %ecx, %eax - ret + jmp .L_done /* * For write fault handling, given the destination is unaligned, diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index ee42bb0cbeb3..9952a01cad24 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -55,26 +55,6 @@ unsigned long clear_user(void __user *to, unsigned long n) EXPORT_SYMBOL(clear_user); /* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - */ -__visible unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++) { - char c; - - if (__get_user_nocheck(c, from++, sizeof(char))) - break; - if (__put_user_nocheck(c, to, sizeof(char))) - break; - } - clac(); - return len; -} - -/* * Similar to copy_user_handle_tail, probe for the write fault point, * but reuse __memcpy_mcsafe in case a new read error is encountered. * clac() is handled in _copy_to_iter_mcsafe(). diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ee8f8ab46941..c0309ea9abee 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -259,7 +259,8 @@ static void note_wx(struct pg_state *st) #endif /* Account the WX pages */ st->wx_pages += npages; - WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(__supported_pte_mask & _PAGE_NX, + "x86/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f905a2371080..8dacdb96899e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,6 +5,7 @@ #include <linux/memblock.h> #include <linux/swapfile.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -766,6 +767,11 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); + /* + * Inform kmemleak about the hole in the memory since the + * corresponding pages will be unmapped. + */ + kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0029604af8a4..dd73d5d74393 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -825,7 +825,7 @@ void __init __early_set_fixmap(enum fixed_addresses idx, pte = early_ioremap_pte(addr); /* Sanitize 'prot' against any unsupported bits: */ - pgprot_val(flags) &= __default_kernel_pte_mask; + pgprot_val(flags) &= __supported_pte_mask; if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 3f452ffed7e9..d669c5e797e0 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -94,7 +94,7 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; - kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT); + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT); kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; /* diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 139b28a01ce4..d0255d64edce 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -35,6 +35,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> @@ -115,7 +116,8 @@ void __init pti_check_boottime_disable(void) } } - if (cmdline_find_option_bool(boot_command_line, "nopti")) { + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bc4bc7b2f075..487b8474c01c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -728,7 +728,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { int cpu; - struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { + struct flush_tlb_info info = { .mm = mm, .stride_shift = stride_shift, .freed_tables = freed_tables, diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h index 0d766f9c1083..50889935138a 100644 --- a/arch/xtensa/include/asm/tlb.h +++ b/arch/xtensa/include/asm/tlb.h @@ -14,32 +14,6 @@ #include <asm/cache.h> #include <asm/page.h> -#if (DCACHE_WAY_SIZE <= PAGE_SIZE) - -/* Note, read http://lkml.org/lkml/2004/1/15/6 */ - -# define tlb_start_vma(tlb,vma) do { } while (0) -# define tlb_end_vma(tlb,vma) do { } while (0) - -#else - -# define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while(0) - -# define tlb_end_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ - } while(0) - -#endif - -#define __tlb_remove_tlb_entry(tlb,pte,addr) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 6af49929de85..30084eaf8422 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -394,3 +394,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register |