Diffstat (limited to 'arch/x86')
145 files changed, 3009 insertions, 2477 deletions
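A recurring theme in the Kconfig and entry-code hunks below is THREAD_INFO_IN_TASK: struct thread_info moves off the bottom of the kernel stack and into task_struct, so helpers like pt_regs_to_thread_info() that derive it from a stack address are replaced by current_thread_info(). This is what makes the HAVE_ARCH_VMAP_STACK selection safe, since a vmapped stack no longer carries the thread_info. A minimal standalone sketch of the new layout (illustrative names only, not kernel code):

#include <stdio.h>

struct thread_info {
	unsigned long flags;
};

/* With THREAD_INFO_IN_TASK, thread_info is the first member of the task. */
struct task_struct {
	struct thread_info thread_info;
	void *stack;		/* kernel stack, now referenced separately */
};

static struct task_struct *current_task;	/* stands in for the per-cpu variable */

static struct thread_info *current_thread_info(void)
{
	/* One load off the task pointer; no stack arithmetic involved. */
	return &current_task->thread_info;
}

int main(void)
{
	struct task_struct task = { .thread_info = { .flags = 0x42 } };

	current_task = &task;
	printf("flags = %#lx\n", current_thread_info()->flags);
	return 0;
}

This is why the entry_64.S hunks below load TASK_TI_flags via PER_CPU_VAR(current_task) instead of computing a thread_info address from %rsp.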
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a72b01b225f2..9b2d50a73a11 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_EBPF_JIT if X86_64 + select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL @@ -109,7 +110,6 @@ config X86 select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS @@ -157,6 +157,7 @@ config X86 select SPARSE_IRQ select SRCU select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 @@ -717,7 +718,6 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -730,7 +730,7 @@ config PARAVIRT_SPINLOCKS config QUEUED_LOCK_STAT bool "Paravirt queued spinlock statistics" - depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS + depends on PARAVIRT_SPINLOCKS && DEBUG_FS ---help--- Enable the collection of statistical data on the slowpath behavior of paravirtualized queued spinlocks and report diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 94dd4a31f5b3..cc69e37548db 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -29,22 +29,11 @@ __pure const struct efi_config *__efi_early(void) static void setup_boot_services##bits(struct efi_config *c) \ { \ efi_system_table_##bits##_t *table; \ - efi_boot_services_##bits##_t *bt; \ \ table = (typeof(table))sys_table; \ \ + c->boot_services = table->boottime; \ c->text_output = table->con_out; \ - \ - bt = (typeof(bt))(unsigned long)(table->boottime); \ - \ - c->allocate_pool = bt->allocate_pool; \ - c->allocate_pages = bt->allocate_pages; \ - c->get_memory_map = bt->get_memory_map; \ - c->free_pool = bt->free_pool; \ - c->free_pages = bt->free_pages; \ - c->locate_handle = bt->locate_handle; \ - c->handle_protocol = bt->handle_protocol; \ - c->exit_boot_services = bt->exit_boot_services; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); @@ -286,29 +275,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } } -static void find_bits(unsigned long mask, u8 *pos, u8 *size) -{ - u8 first, len; - - first = 0; - len = 0; - - if (mask) { - while (!(mask & 0x1)) { - mask = mask >> 1; - first++; - } - - while (mask & 0x1) { - mask = mask >> 1; - len++; - } - } - - *pos = first; - *size = len; -} - static efi_status_t __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) { @@ -578,7 +544,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; u32 *handles = (u32 *)uga_handle;; - efi_status_t status; + efi_status_t status = EFI_INVALID_PARAMETER; int i; first_uga = NULL; @@ -623,7 +589,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; u64 *handles = (u64 *)uga_handle;; - efi_status_t status; + efi_status_t status = EFI_INVALID_PARAMETER; int i; first_uga = NULL; diff 
--git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1038524270e7..fd0b6a272dd5 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -82,7 +82,7 @@ ENTRY(efi_pe_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 88(%eax) + add %esi, 32(%eax) pushl %eax call make_boot_params @@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 88(%eax) + add %esi, 32(%eax) pushl %eax 2: call efi_main @@ -264,7 +264,7 @@ relocated: #ifdef CONFIG_EFI_STUB .data efi32_config: - .fill 11,8,0 + .fill 4,8,0 .long efi_call_phys .long 0 .byte 0 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 0d80a7ad65cd..efdfba21a5b2 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -265,7 +265,7 @@ ENTRY(efi_pe_entry) /* * Relocate efi_config->call(). */ - addq %rbp, efi64_config+88(%rip) + addq %rbp, efi64_config+32(%rip) movq %rax, %rdi call make_boot_params @@ -285,7 +285,7 @@ handover_entry: * Relocate efi_config->call(). */ movq efi_config(%rip), %rax - addq %rbp, 88(%rax) + addq %rbp, 32(%rax) 2: movq efi_config(%rip), %rdi call efi_main @@ -457,14 +457,14 @@ efi_config: #ifdef CONFIG_EFI_MIXED .global efi32_config efi32_config: - .fill 11,8,0 + .fill 4,8,0 .quad efi64_thunk .byte 0 #endif .global efi64_config efi64_config: - .fill 11,8,0 + .fill 4,8,0 .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1433f6b4607d..bdd9cc59d20f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,13 +31,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} - #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ __visible inline void enter_from_user_mode(void) @@ -71,7 +64,7 @@ static long syscall_trace_enter(struct pt_regs *regs) { u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long ret = 0; bool emulated = false; u32 work; @@ -173,18 +166,17 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Disable IRQs and retry */ local_irq_disable(); - cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(current_thread_info()->flags); if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) break; - } } /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -209,7 +201,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) * special case only applies after poking regs and before the * very next return to user mode. 
*/ - ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED); #endif user_enter_irqoff(); @@ -247,7 +239,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) */ __visible inline void syscall_return_slowpath(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags = READ_ONCE(ti->flags); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -270,7 +262,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) #ifdef CONFIG_X86_64 __visible void do_syscall_64(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long nr = regs->orig_ax; enter_from_user_mode(); @@ -303,11 +295,11 @@ __visible void do_syscall_64(struct pt_regs *regs) */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION - ti->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; #endif if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0b56666e6039..b75a8bcd2d23 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -204,34 +204,70 @@ POP_GS_EX .endm +/* + * %eax: prev task + * %edx: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in struct inactive_task_frame + */ + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + + /* switch stack */ + movl %esp, TASK_threadsp(%eax) + movl TASK_threadsp(%edx), %esp + +#ifdef CONFIG_CC_STACKPROTECTOR + movl TASK_stack_canary(%edx), %ebx + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popl %esi + popl %edi + popl %ebx + popl %ebp + + jmp __switch_to +END(__switch_to_asm) + +/* + * A newly forked process directly context switches into this address. + * + * eax: prev task we switched from + * ebx: kernel thread func (NULL for user thread) + * edi: kernel thread arg + */ ENTRY(ret_from_fork) pushl %eax call schedule_tail popl %eax + testl %ebx, %ebx + jnz 1f /* kernel threads are uncommon */ + +2: /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax call syscall_return_slowpath jmp restore_all -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail - popl %eax - movl PT_EBP(%esp), %eax - call *PT_EBX(%esp) - movl $0, PT_EAX(%esp) + /* kernel thread */ +1: movl %edi, %eax + call *%ebx /* - * Kernel threads return to userspace as if returning from a syscall. - * We should check whether anything actually uses this path and, if so, - * consider switching it over to ret_from_fork. + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. 
*/ - movl %esp, %eax - call syscall_return_slowpath - jmp restore_all -ENDPROC(ret_from_kernel_thread) + movl $0, PT_EAX(%esp) + jmp 2b +END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d172c619c449..fee1d95902b5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. */ - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz entry_SYSCALL64_slow_path entry_SYSCALL_64_fastpath: @@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath: */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz 1f LOCKDEP_SYS_EXIT @@ -351,8 +353,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - /* Called from C */ - jmp *%rax /* called from C */ + jmp *%rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -369,41 +370,73 @@ END(ptregs_\func) #include <asm/syscalls_64.h> /* + * %rdi: prev task + * %rsi: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in inactive_task_frame + */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* switch stack */ + movq %rsp, TASK_threadsp(%rdi) + movq TASK_threadsp(%rsi), %rsp + +#ifdef CONFIG_CC_STACKPROTECTOR + movq TASK_stack_canary(%rsi), %rbx + movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + + jmp __switch_to +END(__switch_to_asm) + +/* * A newly forked process directly context switches into this address. * - * rdi: prev task we switched from + * rax: prev task we switched from + * rbx: kernel thread func (NULL for user thread) + * r12: kernel thread arg */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) - + movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ - testb $3, CS(%rsp) /* from kernel_thread? */ - jnz 1f - - /* - * We came from kernel_thread. This code path is quite twisted, and - * someone should clean it up. - * - * copy_thread_tls stashes the function pointer in RBX and the - * parameter to be passed in RBP. The called function is permitted - * to call do_execve and thereby jump to user mode. - */ - movq RBP(%rsp), %rdi - call *RBX(%rsp) - movl $0, RAX(%rsp) + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ - /* - * Fall through as though we're exiting a syscall. This makes a - * twisted sort of sense if we just called do_execve. - */ - -1: +2: movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS jmp restore_regs_and_iret + +1: + /* kernel thread */ + movq %r12, %rdi + call *%rbx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. 
+ */ + movq $0, RAX(%rsp) + jmp 2b END(ret_from_fork) /* @@ -555,27 +588,69 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq %rax - pushq %rdi + /* + * We are running with user GSBASE. All GPRs contain their user + * values. We have a percpu ESPFIX stack that is eight slots + * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom + * of the ESPFIX stack. + * + * We clobber RAX and RDI in this code. We stash RDI on the + * normal stack and RAX on the ESPFIX stack. + * + * The ESPFIX stack layout we set up looks like this: + * + * --- top of ESPFIX stack --- + * SS + * RSP + * RFLAGS + * CS + * RIP <-- RSP points here when we're done + * RAX <-- espfix_waddr points here + * --- bottom of ESPFIX stack --- + */ + + pushq %rdi /* Stash user RDI */ SWAPGS movq PER_CPU_VAR(espfix_waddr), %rdi - movq %rax, (0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ movq %rax, (1*8)(%rdi) - movq (3*8)(%rsp), %rax /* CS */ + movq (2*8)(%rsp), %rax /* user CS */ movq %rax, (2*8)(%rdi) - movq (4*8)(%rsp), %rax /* RFLAGS */ + movq (3*8)(%rsp), %rax /* user RFLAGS */ movq %rax, (3*8)(%rdi) - movq (6*8)(%rsp), %rax /* SS */ + movq (5*8)(%rsp), %rax /* user SS */ movq %rax, (5*8)(%rdi) - movq (5*8)(%rsp), %rax /* RSP */ + movq (4*8)(%rsp), %rax /* user RSP */ movq %rax, (4*8)(%rdi) - andl $0xffff0000, %eax - popq %rdi + /* Now RAX == RSP. */ + + andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ + popq %rdi /* Restore user RDI */ + + /* + * espfix_stack[31:16] == 0. The page tables are set up such that + * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of + * espfix_waddr for any X. That is, there are 65536 RO aliases of + * the same page. Set up RSP so that RSP[31:16] contains the + * respective 16 bits of the /userspace/ RSP and RSP nonetheless + * still points to an RO alias of the ESPFIX stack. + */ orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp - popq %rax + + /* + * At this point, we cannot write to the stack any more, but we can + * still read. + */ + popq %rax /* Restore user RAX */ + + /* + * RSP now points to an ordinary IRET frame, except that the page + * is read-only and RSP[31:16] are preloaded with the userspace + * values. We can now IRET back to userspace. + */ jmp native_irq_return_iret #endif END(common_interrupt) @@ -1002,7 +1077,6 @@ ENTRY(error_entry) testb $3, CS+8(%rsp) jz .Lerror_kernelspace -.Lerror_entry_from_usermode_swapgs: /* * We entered from user mode or we're pretending to have entered * from user mode due to an IRET fault. @@ -1045,7 +1119,8 @@ ENTRY(error_entry) * gsbase and proceed. We'll fix up the exception and land in * .Lgs_change's error handler with kernel gsbase. */ - jmp .Lerror_entry_from_usermode_swapgs + SWAPGS + jmp .Lerror_entry_done .Lbstep_iret: /* Fix truncated RIP */ diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 4f741192846d..3dab75f2a673 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -22,7 +22,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); - if (hdr->e_type != ET_DYN) + if (GET_LE(&hdr->e_type) != ET_DYN) fail("input is not a shared object\n"); /* Walk the segment table. 
*/ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d0efb5cb1b00..d31735f37ed7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -37,6 +37,7 @@ #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> +#include <asm/unwind.h> #include "perf_event.h" @@ -1201,6 +1202,9 @@ static int x86_pmu_add(struct perf_event *event, int flags) * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. + * + * If commit fails, we'll call ->del() on all events + * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; @@ -1223,6 +1227,14 @@ done_collect: cpuc->n_added += n - n0; cpuc->n_txn += n - n0; + if (x86_pmu.add) { + /* + * This is before x86_pmu_enable() will call x86_pmu_start(), + * so we enable LBRs before an event needs them etc.. + */ + x86_pmu.add(event); + } + ret = 0; out: return ret; @@ -1346,7 +1358,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; /* - * If we're called during a txn, we don't need to do anything. + * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * @@ -1354,7 +1366,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - return; + goto do_del; /* * Not a TXN, therefore cleanup properly. @@ -1384,6 +1396,15 @@ static void x86_pmu_del(struct perf_event *event, int flags) --cpuc->n_events; perf_event_update_userpage(event); + +do_del: + if (x86_pmu.del) { + /* + * This is after x86_pmu_stop(); so we disable LBRs after any + * event can need them etc.. + */ + x86_pmu.del(event); + } } int x86_pmu_handle_irq(struct pt_regs *regs) @@ -2247,39 +2268,26 @@ void arch_perf_update_userpage(struct perf_event *event, cyc2ns_read_end(data); } -/* - * callchain support - */ - -static int backtrace_stack(void *data, char *name) -{ - return 0; -} - -static int backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry_ctx *entry = data; - - return perf_callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack_bp, -}; - void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct unwind_state state; + unsigned long addr; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; } - perf_callchain_store(entry, regs->ip); + if (perf_callchain_store(entry, regs->ip)) + return; - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || perf_callchain_store(entry, addr)) + return; + } } static inline int diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index bdcd6510992c..982c9e31daca 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -455,7 +455,7 @@ int intel_bts_interrupt(void) * The only surefire way of knowing if this NMI is ours is by checking * the write ptr against the PMI threshold. 
*/ - if (ds->bts_index >= ds->bts_interrupt_threshold) + if (ds && (ds->bts_index >= ds->bts_interrupt_threshold)) handled = 1; /* @@ -584,7 +584,8 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | + PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; bts_pmu.event_init = bts_event_init; bts_pmu.add = bts_event_add; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 4c9a79b9cd69..a3a9eb84b5cf 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1906,13 +1906,6 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); - /* - * must disable before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_disable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -1924,6 +1917,14 @@ static void intel_pmu_disable_event(struct perf_event *event) intel_pmu_pebs_disable(event); } +static void intel_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + intel_pmu_lbr_del(event); + if (event->attr.precise_ip) + intel_pmu_pebs_del(event); +} + static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { int idx = hwc->idx - INTEL_PMC_IDX_FIXED; @@ -1967,12 +1968,6 @@ static void intel_pmu_enable_event(struct perf_event *event) intel_pmu_enable_bts(hwc->config); return; } - /* - * must enabled before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_enable(event); if (event->attr.exclude_host) cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); @@ -1993,6 +1988,14 @@ static void intel_pmu_enable_event(struct perf_event *event) __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } +static void intel_pmu_add_event(struct perf_event *event) +{ + if (event->attr.precise_ip) + intel_pmu_pebs_add(event); + if (needs_branch_stack(event)) + intel_pmu_lbr_add(event); +} + /* * Save and restart an expired event. Called by NMI contexts, * so it has to be careful about preempting normal event ops: @@ -3291,6 +3294,8 @@ static __initconst const struct x86_pmu intel_pmu = { .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_event, .disable = intel_pmu_disable_event, + .add = intel_pmu_add_event, + .del = intel_pmu_del_event, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9b983a474253..0319311dbdbb 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -806,9 +806,65 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } -static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +/* + * We need the sched_task callback even for per-cpu events when we use + * the large interrupt threshold, such that we can provide PID and TID + * to PEBS samples. 
+ */ +static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) +{ + return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); +} + +static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) +{ + struct debug_store *ds = cpuc->ds; + u64 threshold; + + if (cpuc->n_pebs == cpuc->n_large_pebs) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + } + + ds->pebs_interrupt_threshold = threshold; +} + +static void +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) +{ + /* + * Make sure we get updated with the first PEBS + * event. It will trigger also during removal, but + * that does not hurt: + */ + bool update = cpuc->n_pebs == 1; + + if (needed_cb != pebs_needs_sched_cb(cpuc)) { + if (!needed_cb) + perf_sched_cb_inc(pmu); + else + perf_sched_cb_dec(pmu); + + update = true; + } + + if (update) + pebs_update_threshold(cpuc); +} + +void intel_pmu_pebs_add(struct perf_event *event) { - return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + bool needed_cb = pebs_needs_sched_cb(cpuc); + + cpuc->n_pebs++; + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + cpuc->n_large_pebs++; + + pebs_update_state(needed_cb, cpuc, event->ctx->pmu); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -816,12 +872,9 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; - bool first_pebs; - u64 threshold; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - first_pebs = !pebs_is_enabled(cpuc); cpuc->pebs_enabled |= 1ULL << hwc->idx; if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) @@ -830,46 +883,34 @@ void intel_pmu_pebs_enable(struct perf_event *event) cpuc->pebs_enabled |= 1ULL << 63; /* - * When the event is constrained enough we can use a larger - * threshold and run the event with less frequent PMI. + * Use auto-reload if possible to save a MSR write in the PMI. + * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. 
*/ - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { - threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; - - if (first_pebs) - perf_sched_cb_inc(event->ctx->pmu); - } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - /* - * If not all events can use larger buffer, - * roll back to threshold = 1 - */ - if (!first_pebs && - (ds->pebs_interrupt_threshold > threshold)) - perf_sched_cb_dec(event->ctx->pmu); - } - - /* Use auto-reload if possible to save a MSR write in the PMI */ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { ds->pebs_event_reset[hwc->idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } +} + +void intel_pmu_pebs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + bool needed_cb = pebs_needs_sched_cb(cpuc); - if (first_pebs || ds->pebs_interrupt_threshold > threshold) - ds->pebs_interrupt_threshold = threshold; + cpuc->n_pebs--; + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + cpuc->n_large_pebs--; + + pebs_update_state(needed_cb, cpuc, event->ctx->pmu); } void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool large_pebs = ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size; - if (large_pebs) + if (cpuc->n_pebs == cpuc->n_large_pebs) intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -879,9 +920,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); - if (large_pebs && !pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 707d358e0dff..fc6cf21c535e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -380,7 +380,6 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; /* @@ -390,31 +389,21 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) */ task_ctx = ctx ? ctx->task_ctx_data : NULL; if (task_ctx) { - if (sched_in) { + if (sched_in) __intel_pmu_lbr_restore(task_ctx); - cpuc->lbr_context = ctx; - } else { + else __intel_pmu_lbr_save(task_ctx); - } return; } /* - * When sampling the branck stack in system-wide, it may be - * necessary to flush the stack on context switch. This happens - * when the branch stack does not tag its entries with the pid - * of the current task. Otherwise it becomes impossible to - * associate a branch entry with a task. This ambiguity is more - * likely to appear when the branch stack supports priv level - * filtering and the user sets it to monitor only at the user - * level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch - * stack with branch from multiple tasks. - */ - if (sched_in) { + * Since a context switch can flip the address space and LBR entries + * are not tagged with an identifier, we need to wipe the LBR, even for + * per-cpu events. You simply cannot resolve the branches from the old + * address space. 
+ */ + if (sched_in) intel_pmu_lbr_reset(); - cpuc->lbr_context = ctx; - } } static inline bool branch_user_callstack(unsigned br_sel) @@ -422,7 +411,7 @@ static inline bool branch_user_callstack(unsigned br_sel) return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); } -void intel_pmu_lbr_enable(struct perf_event *event) +void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; @@ -430,27 +419,38 @@ void intel_pmu_lbr_enable(struct perf_event *event) if (!x86_pmu.lbr_nr) return; - /* - * Reset the LBR stack if we changed task context to - * avoid data leaks. - */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = event->ctx; - } cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; task_ctx->lbr_callstack_users++; } - cpuc->lbr_users++; + /* + * Request pmu::sched_task() callback, which will fire inside the + * regular perf event scheduling, so that call will: + * + * - restore or wipe; when LBR-callstack, + * - wipe; otherwise, + * + * when this is from __perf_event_task_sched_in(). + * + * However, if this is from perf_install_in_context(), no such callback + * will follow and we'll need to reset the LBR here if this is the + * first LBR event. + * + * The problem is, we cannot tell these cases apart... but we can + * exclude the biggest chunk of cases by looking at + * event->total_time_running. An event that has accrued runtime cannot + * be 'new'. Conversely, a new event can get installed through the + * context switch path for the first time. 
+ */ perf_sched_cb_inc(event->ctx->pmu); + if (!cpuc->lbr_users++ && !event->total_time_running) + intel_pmu_lbr_reset(); } -void intel_pmu_lbr_disable(struct perf_event *event) +void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; @@ -467,12 +467,6 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled && !cpuc->lbr_users) { - __intel_pmu_lbr_disable(); - /* avoid stale pointer */ - cpuc->lbr_context = NULL; - } } void intel_pmu_lbr_enable_all(bool pmi) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 861a7d9cb60f..c5047b8f777b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -69,6 +69,8 @@ static struct pt_cap_desc { PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), PT_CAP(mtc, 0, CR_EBX, BIT(3)), + PT_CAP(ptwrite, 0, CR_EBX, BIT(4)), + PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), @@ -259,10 +261,16 @@ fail: #define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ RTIT_CTL_MTC_RANGE) +#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \ + RTIT_CTL_FUP_ON_PTW) + #define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ RTIT_CTL_DISRETC | \ RTIT_CTL_CYC_PSB | \ - RTIT_CTL_MTC) + RTIT_CTL_MTC | \ + RTIT_CTL_PWR_EVT_EN | \ + RTIT_CTL_FUP_ON_PTW | \ + RTIT_CTL_PTW_EN) static bool pt_event_valid(struct perf_event *event) { @@ -311,6 +319,20 @@ static bool pt_event_valid(struct perf_event *event) return false; } + if (config & RTIT_CTL_PWR_EVT_EN && + !pt_cap_get(PT_CAP_power_event_trace)) + return false; + + if (config & RTIT_CTL_PTW) { + if (!pt_cap_get(PT_CAP_ptwrite)) + return false; + + /* FUPonPTW without PTW doesn't make sense */ + if ((config & RTIT_CTL_FUP_ON_PTW) && + !(config & RTIT_CTL_PTW_EN)) + return false; + } + return true; } diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index efffa4a09f68..53473c21b554 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -26,11 +26,14 @@ #define RTIT_CTL_CYCLEACC BIT(1) #define RTIT_CTL_OS BIT(2) #define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_EN BIT(4) +#define RTIT_CTL_FUP_ON_PTW BIT(5) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) #define RTIT_CTL_MTC_EN BIT(9) #define RTIT_CTL_TSC_EN BIT(10) #define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_EN BIT(12) #define RTIT_CTL_BRANCH_EN BIT(13) #define RTIT_CTL_MTC_RANGE_OFFSET 14 #define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) @@ -91,6 +94,8 @@ enum pt_capabilities { PT_CAP_psb_cyc, PT_CAP_ip_filtering, PT_CAP_mtc, + PT_CAP_ptwrite, + PT_CAP_power_event_trace, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 28865938aadf..b0f0e835a770 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -357,6 +357,8 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + /* * check event is known (determines counter) */ @@ -765,6 +767,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, 
skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 463dc7a5a6c3..d9844cc74486 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -664,6 +664,8 @@ static int uncore_pmu_event_init(struct perf_event *event) event->cpu = box->cpu; event->pmu_private = box; + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = EXTRA_REG_NONE; @@ -683,7 +685,8 @@ static int uncore_pmu_event_init(struct perf_event *event) /* fixed counters have event field hardcoded to zero */ hwc->config = 0ULL; } else { - hwc->config = event->attr.config & pmu->type->event_mask; + hwc->config = event->attr.config & + (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32)); if (pmu->type->ops->hw_config) { ret = pmu->type->ops->hw_config(box, event); if (ret) @@ -1321,6 +1324,11 @@ static const struct intel_uncore_init_fun skl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun skx_uncore_init __initconst = { + .cpu_init = skx_uncore_cpu_init, + .pci_init = skx_uncore_pci_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1343,6 +1351,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 78b9c23e2d8d..ad986c1e29bc 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -44,6 +44,7 @@ struct intel_uncore_type { unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; + unsigned event_mask_ext; unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; @@ -120,6 +121,7 @@ struct intel_uncore_box { }; #define UNCORE_BOX_FLAG_INITIATED 0 +#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */ struct uncore_event_desc { struct kobj_attribute attr; @@ -172,6 +174,9 @@ static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) static inline unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) { + if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags)) + return idx * 8 + box->pmu->type->event_ctl; + return idx * 4 + box->pmu->type->event_ctl; } @@ -377,6 +382,8 @@ int bdx_uncore_pci_init(void); void bdx_uncore_cpu_init(void); int knl_uncore_pci_init(void); void knl_uncore_cpu_init(void); +int skx_uncore_pci_init(void); +void skx_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 9d35ec0cb8fc..5f845eef9a4d 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -388,6 +388,8 @@ static int snb_uncore_imc_event_init(struct perf_event *event) event->cpu = box->cpu; event->pmu_private = box; + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = 
EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 8aee83bcf71f..272427700d48 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,10 @@ /* SandyBridge-EP/IvyTown uncore support */ #include "uncore.h" +/* SNB-EP pci bus to socket mapping */ +#define SNBEP_CPUNODEID 0x40 +#define SNBEP_GIDNIDMAP 0x54 + /* SNB-EP Box level control */ #define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) #define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) @@ -264,15 +268,72 @@ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* SKX pci bus to socket mapping */ +#define SKX_CPUNODEID 0xc0 +#define SKX_GIDNIDMAP 0xd4 + +/* SKX CHA */ +#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) +#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) +#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17) +#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32) +#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33) +#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35) +#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36) +#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37) +#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41) +#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 (0x3ffULL << 51) +#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + +/* SKX IIO */ +#define SKX_IIO0_MSR_PMON_CTL0 0xa48 +#define SKX_IIO0_MSR_PMON_CTR0 0xa41 +#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40 +#define SKX_IIO_MSR_OFFSET 0x20 + +#define SKX_PMON_CTL_TRESH_MASK (0xff << 24) +#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf) +#define SKX_PMON_CTL_CH_MASK (0xff << 4) +#define SKX_PMON_CTL_FC_MASK (0x7 << 12) +#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SKX_PMON_CTL_TRESH_MASK) +#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \ + SKX_PMON_CTL_CH_MASK | \ + SKX_PMON_CTL_FC_MASK) + +/* SKX IRP */ +#define SKX_IRP0_MSR_PMON_CTL0 0xa5b +#define SKX_IRP0_MSR_PMON_CTR0 0xa59 +#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58 +#define SKX_IRP_MSR_OFFSET 0x20 + +/* SKX UPI */ +#define SKX_UPI_PCI_PMON_CTL0 0x350 +#define SKX_UPI_PCI_PMON_CTR0 0x318 +#define SKX_UPI_PCI_PMON_BOX_CTL 0x378 +#define SKX_PMON_CTL_UMASK_EXT 0xff + +/* SKX M2M */ +#define SKX_M2M_PCI_PMON_CTL0 0x228 +#define SKX_M2M_PCI_PMON_CTR0 0x200 +#define SKX_M2M_PCI_PMON_BOX_CTL 0x258 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); @@ -280,6 +341,8 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, 
"config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); +DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43"); +DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); @@ -288,18 +351,26 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26"); +DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32"); +DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36"); +DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37"); DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60"); DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); @@ -1153,7 +1224,7 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static int snbep_pci2phy_map_init(int devid) +static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid, segment; @@ -1168,12 +1239,12 @@ static int snbep_pci2phy_map_init(int devid) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); if (err) break; nodeid = config; /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, idmap_loc, &config); if (err) break; @@ -1207,11 +1278,20 @@ static int snbep_pci2phy_map_init(int devid) raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { i = -1; - for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; - else - map->pbus_to_physid[bus] = i; + if (reverse) { + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = 
map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } + } else { + for (bus = 0; bus <= 255; bus++) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } } } raw_spin_unlock(&pci2phy_map_lock); @@ -1224,7 +1304,7 @@ static int snbep_pci2phy_map_init(int devid) int snbep_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x3ce0); + int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; uncore_pci_uncores = snbep_pci_uncores; @@ -1788,7 +1868,7 @@ static struct pci_driver ivbep_uncore_pci_driver = { int ivbep_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x0e1e); + int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; uncore_pci_uncores = ivbep_pci_uncores; @@ -2897,7 +2977,7 @@ static struct pci_driver hswep_uncore_pci_driver = { int hswep_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x2f1e); + int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; uncore_pci_uncores = hswep_pci_uncores; @@ -3186,7 +3266,7 @@ static struct pci_driver bdx_uncore_pci_driver = { int bdx_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x6f1e); + int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; @@ -3196,3 +3276,525 @@ int bdx_uncore_pci_init(void) } /* end of BDX uncore support */ + +/* SKX uncore support */ + +static struct intel_uncore_type skx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct attribute *skx_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid4.attr, + &format_attr_filter_link4.attr, + &format_attr_filter_state5.attr, + &format_attr_filter_rem.attr, + &format_attr_filter_loc.attr, + &format_attr_filter_nm.attr, + &format_attr_filter_all_op.attr, + &format_attr_filter_not_nm.attr, + &format_attr_filter_opc_0.attr, + &format_attr_filter_opc_1.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group skx_uncore_chabox_format_group = { + .name = "format", + .attrs = skx_uncore_cha_formats_attr, +}; + +static struct event_constraint skx_uncore_chabox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg skx_uncore_cha_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4), +}; + +static u64 skx_cha_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= 
SKX_CHA_MSR_PMON_BOX_FILTER_STATE; + return mask; +} + +static struct event_constraint * +skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask); +} + +static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = skx_uncore_cha_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & skx_cha_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops skx_uncore_chabox_ops = { + /* There is no frz_en for chabox ctl */ + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = skx_cha_hw_config, + .get_constraint = skx_cha_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type skx_uncore_chabox = { + .name = "cha", + .num_counters = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = skx_uncore_chabox_constraints, + .ops = &skx_uncore_chabox_ops, + .format_group = &skx_uncore_chabox_format_group, +}; + +static struct attribute *skx_uncore_iio_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh9.attr, + &format_attr_ch_mask.attr, + &format_attr_fc_mask.attr, + NULL, +}; + +static struct attribute_group skx_uncore_iio_format_group = { + .name = "format", + .attrs = skx_uncore_iio_formats_attr, +}; + +static struct event_constraint skx_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), + UNCORE_EVENT_CONSTRAINT(0x95, 0xc), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + EVENT_CONSTRAINT_END +}; + +static void skx_iio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops skx_uncore_iio_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = skx_iio_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type skx_uncore_iio = { + .name = "iio", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SKX_IIO0_MSR_PMON_CTL0, + .perf_ctr = SKX_IIO0_MSR_PMON_CTR0, + .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL, + .msr_offset = SKX_IIO_MSR_OFFSET, + .constraints = skx_uncore_iio_constraints, + .ops = &skx_uncore_iio_ops, + .format_group = &skx_uncore_iio_format_group, +}; + +static struct 
attribute *skx_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group skx_uncore_format_group = { + .name = "format", + .attrs = skx_uncore_formats_attr, +}; + +static struct intel_uncore_type skx_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SKX_IRP0_MSR_PMON_CTL0, + .perf_ctr = SKX_IRP0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL, + .msr_offset = SKX_IRP_MSR_OFFSET, + .ops = &skx_uncore_iio_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct intel_uncore_ops skx_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type skx_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &skx_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *skx_msr_uncores[] = { + &skx_uncore_ubox, + &skx_uncore_chabox, + &skx_uncore_iio, + &skx_uncore_irp, + &skx_uncore_pcu, + NULL, +}; + +static int skx_count_chabox(void) +{ + struct pci_dev *chabox_dev = NULL; + int bus, count = 0; + + while (1) { + chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev); + if (!chabox_dev) + break; + if (count == 0) + bus = chabox_dev->bus->number; + if (bus != chabox_dev->bus->number) + break; + count++; + } + + pci_dev_put(chabox_dev); + return count; +} + +void skx_uncore_cpu_init(void) +{ + skx_uncore_chabox.num_boxes = skx_count_chabox(); + uncore_msr_uncores = skx_msr_uncores; +} + +static struct intel_uncore_type skx_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct attribute *skx_upi_uncore_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask_ext.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group skx_upi_uncore_format_group = { + .name = "format", + .attrs = skx_upi_uncore_formats_attr, +}; + +static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +static struct intel_uncore_ops skx_upi_uncore_pci_ops = { + .init_box = skx_upi_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type 
skx_uncore_upi = { + .name = "upi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SKX_UPI_PCI_PMON_CTR0, + .event_ctl = SKX_UPI_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_PMON_CTL_UMASK_EXT, + .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, + .ops = &skx_upi_uncore_pci_ops, + .format_group = &skx_upi_uncore_format_group, +}; + +static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +static struct intel_uncore_ops skx_m2m_uncore_pci_ops = { + .init_box = skx_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type skx_uncore_m2m = { + .name = "m2m", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SKX_M2M_PCI_PMON_CTR0, + .event_ctl = SKX_M2M_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL, + .ops = &skx_m2m_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct event_constraint skx_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type skx_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .constraints = skx_uncore_m2pcie_constraints, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct event_constraint skx_uncore_m3upi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x1d, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1e, 0x1), + UNCORE_EVENT_CONSTRAINT(0x40, 0x7), + UNCORE_EVENT_CONSTRAINT(0x4e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x4f, 0x7), + UNCORE_EVENT_CONSTRAINT(0x50, 0x7), + UNCORE_EVENT_CONSTRAINT(0x51, 0x7), + UNCORE_EVENT_CONSTRAINT(0x52, 0x7), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type skx_uncore_m3upi = { + .name = "m3upi", + .num_counters = 3, + .num_boxes = 3, + .perf_ctr_bits = 48, + .constraints = skx_uncore_m3upi_constraints, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +enum { + SKX_PCI_UNCORE_IMC, + SKX_PCI_UNCORE_M2M, + SKX_PCI_UNCORE_UPI, + SKX_PCI_UNCORE_M2PCIE, + SKX_PCI_UNCORE_M3UPI, +}; + +static struct intel_uncore_type *skx_pci_uncores[] = { + [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc, + [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m, + [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi, + [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie, + [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi, + NULL, +}; + +static const struct pci_device_id skx_uncore_pci_ids[] = { + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5), + }, + { /* M2M0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0), + }, + { /* M2M1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1), + }, + { /* UPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0), + }, + { /* UPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1), + }, + { /* UPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2), + }, + { /* M2PCIe 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0), + }, + { /* M2PCIe 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1), + }, + { /* M2PCIe 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2), + }, + { /* M2PCIe 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3), + }, + { /* M3UPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0), + }, + { /* M3UPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1), + }, + { /* M3UPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2), + }, + { /* end: all zeroes */ } +}; + + +static struct pci_driver skx_uncore_pci_driver = { + .name = "skx_uncore", + .id_table = skx_uncore_pci_ids, +}; + +int skx_uncore_pci_init(void) +{ + /* need to double check pci address */ + int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false); + + if (ret) + return ret; + + uncore_pci_uncores = skx_pci_uncores; + uncore_pci_driver = &skx_uncore_pci_driver; + return 0; +} + +/* end of SKX uncore support */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8c4a47706296..5874d8de1f8d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -194,12 +194,13 @@ struct cpu_hw_events { */ struct debug_store *ds; u64 pebs_enabled; + int n_pebs; + int n_large_pebs; /* * Intel LBR bits */ int lbr_users; - void *lbr_context; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; @@ -508,6 +509,8 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*add)(struct perf_event *); + void (*del)(struct perf_event *); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); 
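/*
 * Editor's note, not part of the patch: the new ->add()/->del() callbacks
 * above appear to give the PEBS and LBR code a hook at event scheduling
 * time; consistent with that, the perf_event.h hunk below declares
 * intel_pmu_pebs_add()/intel_pmu_pebs_del() and renames
 * intel_pmu_lbr_enable()/intel_pmu_lbr_disable() to
 * intel_pmu_lbr_add()/intel_pmu_lbr_del().
 */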
unsigned eventsel; @@ -888,6 +891,10 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); +void intel_pmu_pebs_add(struct perf_event *event); + +void intel_pmu_pebs_del(struct perf_event *event); + void intel_pmu_pebs_enable(struct perf_event *event); void intel_pmu_pebs_disable(struct perf_event *event); @@ -906,9 +913,9 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); -void intel_pmu_lbr_enable(struct perf_event *event); +void intel_pmu_lbr_add(struct perf_event *event); -void intel_pmu_lbr_disable(struct perf_event *event); +void intel_pmu_lbr_del(struct perf_event *event); void intel_pmu_lbr_enable_all(bool pmi); diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e77a6443104f..1b020381ab38 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -217,10 +217,14 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) \ +{ \ + register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) + : output, "+r" (__sp) \ + : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input); \ +} /* * use this macro(s) if you need more than one output parameter diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 124357773ffa..f5aaf6c83222 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -650,8 +650,8 @@ static inline void entering_ack_irq(void) static inline void ipi_entering_ack_irq(void) { - ack_APIC_irq(); irq_enter(); + ack_APIC_irq(); } static inline void exiting_irq(void) @@ -661,9 +661,8 @@ static inline void exiting_irq(void) static inline void exiting_ack_irq(void) { - irq_exit(); - /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); + irq_exit(); } extern void ioapic_zap_locks(void); diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 9733361fed6f..97848cdfcb1a 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -158,53 +158,9 @@ extern void __add_wrong_size(void) * value of "*ptr". 
* * xadd() is locked when multiple CPUs are online - * xadd_sync() is always locked - * xadd_local() is never locked */ #define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) #define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") -#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") - -#define __add(ptr, inc, lock) \ - ({ \ - __typeof__ (*(ptr)) __ret = (inc); \ - switch (sizeof(*(ptr))) { \ - case __X86_CASE_B: \ - asm volatile (lock "addb %b1, %0\n" \ - : "+m" (*(ptr)) : "qi" (inc) \ - : "memory", "cc"); \ - break; \ - case __X86_CASE_W: \ - asm volatile (lock "addw %w1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ - break; \ - case __X86_CASE_L: \ - asm volatile (lock "addl %1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ - break; \ - case __X86_CASE_Q: \ - asm volatile (lock "addq %1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ - break; \ - default: \ - __add_wrong_size(); \ - } \ - __ret; \ - }) - -/* - * add_*() adds "inc" to "*ptr" - * - * __add() takes a lock prefix - * add_smp() is locked when multiple CPUs are online - * add_sync() is always locked - */ -#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX) -#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ") #define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ ({ \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 92a8308b96f6..1188bc849ee3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,7 +106,6 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4e10d73cf018..12080d87da3b 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; -extern struct desc_ptr debug_idt_descr; +extern const struct desc_ptr debug_idt_descr; extern gate_desc debug_idt_table[]; struct gdt_page { diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3ab0537872fb..476b574de99e 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -10,8 +10,8 @@ #include <uapi/asm/e820.h> #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ -extern struct e820map e820; -extern struct e820map e820_saved; +extern struct e820map *e820; +extern struct e820map *e820_saved; extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); @@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +extern void e820_reallocate_tables(void); + /* * Returns true iff the specified range [s,e) is completely contained inside * the ISA region. 
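[ Editor's note: the e820.h hunk above converts e820 and e820_saved from embedded structs to pointers and adds e820_reallocate_tables(). The sketch below is a minimal user-space illustration of the pointer-switch pattern that conversion enables; the struct layout, the e820_reallocate_tables_sketch() body and the 128-entry bound are simplifying assumptions for illustration, not the code from arch/x86/kernel/e820.c. ]

/* Illustrative types only -- much simpler than the kernel's e820 structures. */
#include <stdlib.h>
#include <string.h>

struct e820entry { unsigned long long addr, size; unsigned int type; };
struct e820map { unsigned int nr_map; struct e820entry map[128]; };

static struct e820map e820_boot;	/* static storage used early in boot */
struct e820map *e820 = &e820_boot;	/* all users go through the pointer */

/* Hypothetical reallocation step, run once dynamic allocation is available. */
int e820_reallocate_tables_sketch(void)
{
	struct e820map *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	memcpy(n, e820, sizeof(*n));
	e820 = n;	/* call sites keep working, now against the copy */
	return 0;
}

[ The design point: because every user dereferences the pointer, the tables can later be moved into dynamically allocated, potentially larger storage without touching any call site. ]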
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index d0bb76d81402..389d700b961e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -117,7 +117,6 @@ extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); extern void __init efi_print_memmap(void); -extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); @@ -192,14 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( struct efi_config { u64 image_handle; u64 table; - u64 allocate_pool; - u64 allocate_pages; - u64 get_memory_map; - u64 free_pool; - u64 free_pages; - u64 locate_handle; - u64 handle_protocol; - u64 exit_boot_services; + u64 boot_services; u64 text_output; efi_status_t (*call)(unsigned long, ...); bool is64; @@ -207,14 +199,27 @@ struct efi_config { __pure const struct efi_config *__efi_early(void); +static inline bool efi_is_64bit(void) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return false; + + if (!IS_ENABLED(CONFIG_EFI_MIXED)) + return true; + + return __efi_early()->is64; +} + #define efi_call_early(f, ...) \ - __efi_early()->call(__efi_early()->f, __VA_ARGS__); + __efi_early()->call(efi_is_64bit() ? \ + ((efi_boot_services_64_t *)(unsigned long) \ + __efi_early()->boot_services)->f : \ + ((efi_boot_services_32_t *)(unsigned long) \ + __efi_early()->boot_services)->f, __VA_ARGS__) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); -#define efi_is_64bit() __efi_early()->is64 - extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ae55a43e09c0..d4957ac72b48 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -45,7 +45,8 @@ extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); +extern void __init update_regset_xstate_info(unsigned int size, + u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a4820d4df617..eccd0ac6bc38 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -6,6 +6,7 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else # define MCOUNT_ADDR ((unsigned long)(mcount)) +# define HAVE_FUNCTION_GRAPH_FP_TEST #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -13,6 +14,8 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifndef __ASSEMBLY__ extern void mcount(void); extern atomic_t modifying_ftrace_code; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 055ea9941dd5..67942b6ad4b7 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -43,6 +43,9 @@ struct hypervisor_x86 { /* X2APIC detection (run once per boot) */ bool (*x2apic_available)(void); + + /* pin current vcpu to specified physical cpu (run rarely) */ + void (*pin_vcpu)(int); }; extern const struct hypervisor_x86 *x86_hyper; @@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor(struct cpuinfo_x86 *c); extern void 
init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); +extern void hypervisor_pin_vcpu(int cpu); #else static inline void init_hypervisor(struct cpuinfo_x86 *c) { } static inline void init_hypervisor_platform(void) { } diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 2674ee3de748..1052a797d71d 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY extern unsigned long page_offset_base; extern unsigned long vmalloc_base; +extern unsigned long vmemmap_base; void kernel_randomize_memory(void); #else diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 1ef9d581b5d9..d31881188431 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -24,8 +24,6 @@ enum die_val { extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); -extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp, unsigned long bp); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8bf766ef0e18..9bd7ff5ffbcc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,9 +40,10 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ +#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ -#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ /* * McaX field if set indicates a given bank supports MCA extensions: @@ -110,6 +111,7 @@ #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006 #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a @@ -119,6 +121,7 @@ #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) @@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected, * Scalable MCA. 
*/ #ifdef CONFIG_X86_MCE_AMD -enum amd_ip_types { - SMCA_F17H_CORE = 0, /* Core errors */ - SMCA_DF, /* Data Fabric */ - SMCA_UMC, /* Unified Memory Controller */ - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ - SMCA_SMU, /* System Management Unit */ - N_AMD_IP_TYPES -}; - -struct amd_hwid { - const char *name; - unsigned int hwid; -}; - -extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; -enum amd_core_mca_blocks { +/* These may be used by multiple smca_hwid_mcatypes */ +enum smca_bank_types { SMCA_LS = 0, /* Load Store */ SMCA_IF, /* Instruction Fetch */ - SMCA_L2_CACHE, /* L2 cache */ - SMCA_DE, /* Decoder unit */ - RES, /* Reserved */ - SMCA_EX, /* Execution unit */ + SMCA_L2_CACHE, /* L2 Cache */ + SMCA_DE, /* Decoder Unit */ + SMCA_EX, /* Execution Unit */ SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 cache */ - N_CORE_MCA_BLOCKS + SMCA_L3_CACHE, /* L3 Cache */ + SMCA_CS, /* Coherent Slave */ + SMCA_PIE, /* Power, Interrupts, etc. */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_SMCA_BANK_TYPES }; -extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES]; + +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) -enum amd_df_mca_blocks { - SMCA_CS = 0, /* Coherent Slave */ - SMCA_PIE, /* Power management, Interrupts, etc */ - N_DF_BLOCKS +struct smca_hwid_mcatype { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ + u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. 
*/ }; -extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +struct smca_bank_info { + struct smca_hwid_mcatype *type; + u32 type_instance; +}; + +extern struct smca_bank_info smca_banks[MAX_NR_BANKS]; + #endif #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index b07233b64578..32007041ef8c 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,6 @@ #include <asm/x86_init.h> #include <asm/apicdef.h> -extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; +extern u8 boot_cpu_apic_version; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC @@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); +int __generic_processor_info(int apicid, int version, bool enabled); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2970d22d7766..ce932812f142 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -80,10 +80,6 @@ static inline unsigned long __read_cr4(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } -static inline unsigned long __read_cr4_safe(void) -{ - return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); -} static inline void __write_cr4(unsigned long x) { @@ -661,8 +657,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -#ifdef CONFIG_QUEUED_SPINLOCKS - static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { @@ -684,22 +678,6 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } -#else /* !CONFIG_QUEUED_SPINLOCKS */ - -static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, - __ticket_t ticket) -{ - PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); -} - -static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, - __ticket_t ticket) -{ - PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); -} - -#endif /* CONFIG_QUEUED_SPINLOCKS */ - #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7fa9e7740ba3..0f400c0e4979 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -108,7 +108,6 @@ struct pv_cpu_ops { unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); - unsigned long (*read_cr4_safe)(void); unsigned long (*read_cr4)(void); void (*write_cr4)(unsigned long); @@ -301,23 +300,16 @@ struct pv_mmu_ops { struct arch_spinlock; #ifdef CONFIG_SMP #include <asm/spinlock_types.h> -#else -typedef u16 __ticket_t; #endif struct qspinlock; struct pv_lock_ops { -#ifdef CONFIG_QUEUED_SPINLOCKS void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); struct paravirt_callee_save queued_spin_unlock; void (*wait)(u8 *ptr, u8 val); void (*kick)(int cpu); -#else /* !CONFIG_QUEUED_SPINLOCKS */ - struct paravirt_callee_save lock_spinning; - void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); -#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git 
a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6fdef9eef2d5..3a264200c62f 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t; #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_SIZE_TB _AC(32, UL) #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) #ifdef CONFIG_RANDOMIZE_MEMORY #define VMALLOC_START vmalloc_base +#define VMEMMAP_START vmemmap_base #else #define VMALLOC_START __VMALLOC_BASE +#define VMEMMAP_START __VMEMMAP_BASE #endif /* CONFIG_RANDOMIZE_MEMORY */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 643eba42d620..2c1ebeb4d737 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) { - if (static_cpu_has(X86_FEATURE_MCE_RECOVERY)) - return memcpy_mcsafe(dst, src, n); - memcpy(dst, src, n); - return 0; + return memcpy_mcsafe(dst, src, n); } /** diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 63def9537a2d..984a7bf17f6a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -389,9 +389,9 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif -#ifdef CONFIG_X86_32 - unsigned long ip; -#endif + + u32 status; /* thread synchronous flags */ + #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; @@ -438,6 +438,15 @@ struct thread_struct { }; /* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + +/* * Set IOPL bits in EFLAGS from given mask */ static inline void native_set_iopl_mask(unsigned mask) @@ -724,8 +733,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -extern unsigned long thread_saved_pc(struct task_struct *tsk); - /* * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" @@ -776,17 +783,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); .addr_limit = KERNEL_DS, \ } -/* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. 
- */ -#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8)) - #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ +extern unsigned long thread_saved_pc(struct task_struct *tsk); + extern void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index b2988c0ed829..230e1903acf0 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -44,9 +44,9 @@ struct trampoline_header { extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; -extern unsigned long init_rsp; extern unsigned long initial_code; extern unsigned long initial_gs; +extern unsigned long initial_stack; extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 8dbc762ad132..3d33a719f5c1 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -154,7 +154,7 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem) : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), CC_OUT(e) (result) : "er" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + : "memory"); return result; } diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ebd0c164cd4e..19980b36f394 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); #endif -/* Static state in head.S used to set up a CPU */ -extern unsigned long stack_start; /* Initial stack pointer address */ - struct task_struct; struct smp_ops { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 587d7914ea4b..19a2224f9e16 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -59,22 +59,19 @@ static inline void native_write_cr3(unsigned long val) static inline unsigned long native_read_cr4(void) { unsigned long val; - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist. In x86_64, a cr4 always - * exists, so it will never fail. */ #ifdef CONFIG_X86_32 + /* + * This could fault if CR4 does not exist. Non-existent CR4 + * is functionally equivalent to CR4 == 0. Keep it simple and pretend + * that CR4 == 0 on CPUs that don't have CR4. + */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val), "=m" (__force_order) : "0" (0)); #else - val = native_read_cr4(); + /* CR4 always exists on x86_64. 
*/ + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); #endif return val; } @@ -182,11 +179,6 @@ static inline unsigned long __read_cr4(void) return native_read_cr4(); } -static inline unsigned long __read_cr4_safe(void) -{ - return native_read_cr4_safe(); -} - static inline void __write_cr4(unsigned long x) { native_write_cr4(x); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index be0a05913b91..921bea7a2708 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -20,187 +20,13 @@ * (the type definitions are in asm/spinlock_types.h) */ -#ifdef CONFIG_X86_32 -# define LOCK_PTR_REG "a" -#else -# define LOCK_PTR_REG "D" -#endif - -#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE)) -/* - * On PPro SMP, we use a locked operation to unlock - * (PPro errata 66, 92) - */ -# define UNLOCK_LOCK_PREFIX LOCK_PREFIX -#else -# define UNLOCK_LOCK_PREFIX -#endif - /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); -#ifdef CONFIG_QUEUED_SPINLOCKS #include <asm/qspinlock.h> -#else - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - -static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) -{ - set_bit(0, (volatile unsigned long *)&lock->tickets.head); -} - -#else /* !CONFIG_PARAVIRT_SPINLOCKS */ -static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, - __ticket_t ticket) -{ -} -static inline void __ticket_unlock_kick(arch_spinlock_t *lock, - __ticket_t ticket) -{ -} - -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ -static inline int __tickets_equal(__ticket_t one, __ticket_t two) -{ - return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); -} - -static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, - __ticket_t head) -{ - if (head & TICKET_SLOWPATH_FLAG) { - arch_spinlock_t old, new; - - old.tickets.head = head; - new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; - old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; - new.tickets.tail = old.tickets.tail; - - /* try to clear slowpath flag when there are no contenders */ - cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); - } -} - -static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) -{ - return __tickets_equal(lock.tickets.head, lock.tickets.tail); -} - -/* - * Ticket locks are conceptually two parts, one indicating the current head of - * the queue, and the other indicating the current tail. The lock is acquired - * by atomically noting the tail and incrementing it by one (thus adding - * ourself to the queue and noting our position), then waiting until the head - * becomes equal to the the initial value of the tail. - * - * We use an xadd covering *both* parts of the lock, to increment the tail and - * also load the position of the head, which takes care of memory ordering - * issues and should be optimal for the uncontended case. Note the tail must be - * in the high part, because a wide xadd increment of the low part would carry - * up and contaminate the high part. 
- */ -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; - - inc = xadd(&lock->tickets, inc); - if (likely(inc.head == inc.tail)) - goto out; - - for (;;) { - unsigned count = SPIN_THRESHOLD; - - do { - inc.head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(inc.head, inc.tail)) - goto clear_slowpath; - cpu_relax(); - } while (--count); - __ticket_lock_spinning(lock, inc.tail); - } -clear_slowpath: - __ticket_check_and_clear_slowpath(lock, inc.head); -out: - barrier(); /* make sure nothing creeps before the lock is taken */ -} - -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) -{ - arch_spinlock_t old, new; - - old.tickets = READ_ONCE(lock->tickets); - if (!__tickets_equal(old.tickets.head, old.tickets.tail)) - return 0; - - new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); - new.head_tail &= ~TICKET_SLOWPATH_FLAG; - - /* cmpxchg is a full barrier, so nothing can move before it */ - return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; -} - -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - if (TICKET_SLOWPATH_FLAG && - static_key_false(&paravirt_ticketlocks_enabled)) { - __ticket_t head; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - - if (unlikely(head & TICKET_SLOWPATH_FLAG)) { - head &= ~TICKET_SLOWPATH_FLAG; - __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); - } - } else - __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); -} - -static inline int arch_spin_is_locked(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = READ_ONCE(lock->tickets); - - return !__tickets_equal(tmp.tail, tmp.head); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = READ_ONCE(lock->tickets); - - tmp.head &= ~TICKET_SLOWPATH_FLAG; - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, - unsigned long flags) -{ - arch_spin_lock(lock); -} - -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - __ticket_t head = READ_ONCE(lock->tickets.head); - - for (;;) { - struct __raw_tickets tmp = READ_ONCE(lock->tickets); - /* - * We need to check "unlocked" in a loop, tmp.head == head - * can be false positive because of overflow. 
- */ - if (__tickets_equal(tmp.head, tmp.tail) || - !__tickets_equal(tmp.head, head)) - break; - - cpu_relax(); - } -} -#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 65c3e37f879a..25311ebb446c 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,20 +23,7 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#ifdef CONFIG_QUEUED_SPINLOCKS #include <asm-generic/qspinlock_types.h> -#else -typedef struct arch_spinlock { - union { - __ticketpair_t head_tail; - struct __raw_tickets { - __ticket_t head, tail; - } tickets; - }; -} arch_spinlock_t; - -#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } -#endif /* CONFIG_QUEUED_SPINLOCKS */ #include <asm-generic/qrwlock_types.h> diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 0944218af9e2..37f2e0b377ad 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -8,86 +8,86 @@ #include <linux/uaccess.h> #include <linux/ptrace.h> +#include <asm/switch_to.h> + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, +}; -extern int kstack_depth_to_print; - -struct thread_info; -struct stacktrace_ops; - -typedef unsigned long (*walk_stack_t)(struct task_struct *task, - unsigned long *stack, - unsigned long bp, - const struct stacktrace_ops *ops, - void *data, - unsigned long *end, - int *graph); - -extern unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); - -extern unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); - -/* Generic stack tracer with callbacks */ - -struct stacktrace_ops { - int (*address)(void *data, unsigned long address, int reliable); - /* On negative return stop dumping */ - int (*stack)(void *data, char *name); - walk_stack_t walk_stack; +struct stack_info { + enum stack_type type; + unsigned long *begin, *end, *next_sp; }; -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data); +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +void stack_type_str(enum stack_type type, const char **begin, + const char **end); + +static inline bool on_stack(struct stack_info *info, void *addr, size_t len) +{ + void *begin = info->begin; + void *end = info->end; + + return (info->type != STACK_TYPE_UNKNOWN && + addr >= begin && addr < end && + addr + len > begin && addr + len <= end); +} + +extern int kstack_depth_to_print; #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) #else #define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif #ifdef CONFIG_FRAME_POINTER -static inline unsigned long -stack_frame(struct task_struct *task, struct pt_regs *regs) +static inline unsigned long * +get_frame_pointer(struct 
task_struct *task, struct pt_regs *regs) { - unsigned long bp; - if (regs) - return regs->bp; + return (unsigned long *)regs->bp; - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - return bp; - } + if (task == current) + return __builtin_frame_address(0); - /* bp is the last reg pushed by switch_to */ - return *(unsigned long *)task->thread.sp; + return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; } #else -static inline unsigned long -stack_frame(struct task_struct *task, struct pt_regs *regs) +static inline unsigned long * +get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { - return 0; + return NULL; +} +#endif /* CONFIG_FRAME_POINTER */ + +static inline unsigned long * +get_stack_pointer(struct task_struct *task, struct pt_regs *regs) +{ + if (regs) + return (unsigned long *)kernel_stack_pointer(regs); + + if (task == current) + return __builtin_frame_address(0); + + return (unsigned long *)task->thread.sp; } -#endif -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl); -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl); extern unsigned int code_bytes; @@ -106,7 +106,7 @@ static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; - get_bp(frame); + frame = __builtin_frame_address(0); #ifdef CONFIG_FRAME_POINTER frame = frame->next_frame; diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 90dbbd9666d4..a164862d77e3 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -2,6 +2,7 @@ #define _ASM_X86_STRING_64_H #ifdef __KERNEL__ +#include <linux/jump_label.h> /* Written 2002 by Andi Kleen */ @@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); +DECLARE_STATIC_KEY_FALSE(mcsafe_key); + /** * memcpy_mcsafe - copy memory with indication if a machine check happened * @@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct); * @cnt: number of bytes to copy * * Low level memory copy function that catches machine checks + * We only call into the "safe" function on systems that can + * actually do machine check recovery. Everyone else can just + * use memcpy(). 
* * Return 0 for success, -EFAULT for fail */ -int memcpy_mcsafe(void *dst, const void *src, size_t cnt); +static __always_inline __must_check int +memcpy_mcsafe(void *dst, const void *src, size_t cnt) +{ +#ifdef CONFIG_X86_MCE + if (static_branch_unlikely(&mcsafe_key)) + return memcpy_mcsafe_unrolled(dst, src, cnt); + else +#endif + memcpy(dst, src, cnt); + return 0; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8f321a1b03a1..5cb436acd463 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,130 +2,66 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ + +struct task_struct *__switch_to_asm(struct task_struct *prev, + struct task_struct *next); + __visible struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); -#ifdef CONFIG_X86_32 +/* This runs on the previous thread's stack. */ +static inline void prepare_switch_to(struct task_struct *prev, + struct task_struct *next) +{ +#ifdef CONFIG_VMAP_STACK + /* + * If we switch to a stack that has a top-level paging entry + * that is not present in the current mm, the resulting #PF will + * be promoted to a double-fault and we'll panic. Probe + * the new stack now so that vmalloc_fault can fix up the page + * tables if needed. This can only happen if we use a stack + * in vmap space. + * + * We assume that the stack is aligned so that it never spans + * more than one top-level paging entry. + * + * To minimize cache pollution, just follow the stack pointer. + */ + READ_ONCE(*(unsigned char *)next->thread.sp); +#endif +} + +asmlinkage void ret_from_fork(void); + +/* data that is pointed to by thread.sp */ +struct inactive_task_frame { +#ifdef CONFIG_X86_64 + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; +#else + unsigned long si; + unsigned long di; +#endif + unsigned long bx; + unsigned long bp; + unsigned long ret_addr; +}; -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ +struct fork_frame { + struct inactive_task_frame frame; + struct pt_regs regs; +}; -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ #define switch_to(prev, next, last) \ do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. 
\ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ - asm volatile("pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ + prepare_switch_to(prev, next); \ \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ + ((last) = __switch_to_asm((prev), (next))); \ } while (0) -#else /* CONFIG_X86_32 */ - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15", "flags" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * There is no need to save or restore flags, because flags are always - * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL - * has no effect. - */ -#define switch_to(prev, next, last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) - -#endif /* CONFIG_X86_32 */ - #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4e23dd15c661..e3c95e8e61c5 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task, * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. 
*/ - if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task_thread_info(task)->status & TS_COMPAT) + if (task->thread.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task, const unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task_thread_info(task)->status & TS_COMPAT) + if (task->thread.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -234,18 +234,8 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(void) { -#ifdef CONFIG_IA32_EMULATION - /* - * TS_COMPAT is set for 32-bit syscall entry and then - * remains set until we return to user mode. - * - * x32 tasks should be considered AUDIT_ARCH_X86_64. - */ - if (task_thread_info(current)->status & TS_COMPAT) - return AUDIT_ARCH_I386; -#endif - /* Both x32 and x86_64 are considered "64-bit". */ - return AUDIT_ARCH_X86_64; + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8b7c8d8e0852..2aaca53c0974 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -52,21 +52,6 @@ struct task_struct; #include <asm/cpufeature.h> #include <linux/atomic.h> -struct thread_info { - struct task_struct *task; /* main task structure */ - __u32 flags; /* low level flags */ - __u32 status; /* thread synchronous flags */ - __u32 cpu; /* current CPU */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .flags = 0, \ - .cpu = 0, \ -} - -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ @@ -95,7 +80,6 @@ struct thread_info { #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ -#define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -119,7 +103,6 @@ struct thread_info { #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) -#define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) @@ -160,11 +143,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); -} - static inline unsigned long current_stack_pointer(void) { unsigned long sp; @@ -226,60 +204,19 @@ static inline int arch_within_stack_frames(const void * const stack, # define cpu_current_top_of_stack (cpu_tss + TSS_sp0) #endif -/* - * ASM operand which evaluates to a 'thread_info' address of - * the current task, if it is known that "reg" is exactly "off" - * bytes below the top of the stack currently. 
- * - * ( The kernel stack's size is known at build time, it is usually - * 2 or 4 pages, and the bottom of the kernel stack contains - * the thread_info structure. So to access the thread_info very - * quickly from assembly code we can calculate down from the - * top of the kernel stack to the bottom, using constant, - * build-time calculations only. ) - * - * For example, to fetch the current thread_info->flags value into %eax - * on x86-64 defconfig kernels, in syscall entry code where RSP is - * currently at exactly SIZEOF_PTREGS bytes away from the top of the - * stack: - * - * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax - * - * will translate to: - * - * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax - * - * which is below the current RSP by almost 16K. - */ -#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) - #endif -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ #endif - #ifndef __ASSEMBLY__ -static inline bool in_ia32_syscall(void) -{ #ifdef CONFIG_X86_32 - return true; -#endif -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & TS_COMPAT) - return true; +#define in_ia32_syscall() true +#else +#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \ + current->thread.status & TS_COMPAT) #endif - return false; -} /* * Force syscall return via IRET by making it look as if there was diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c3496619740a..01fd0a7f48cd 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); +#ifdef CONFIG_VMAP_STACK +void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address); +#endif + /* Interrupts/Exceptions */ enum { X86_TRAP_DE = 0, /* 0, Divide-by-zero */ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h new file mode 100644 index 000000000000..c4b6d1cafa46 --- /dev/null +++ b/arch/x86/include/asm/unwind.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_UNWIND_H +#define _ASM_X86_UNWIND_H + +#include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + int graph_idx; +#ifdef CONFIG_FRAME_POINTER + unsigned long *bp; +#else + unsigned long *sp; +#endif +}; + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame); + +bool unwind_next_frame(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + first_frame = first_frame ? 
: get_stack_pointer(task, regs); + + __unwind_start(state, task, regs, first_frame); +} + +#ifdef CONFIG_FRAME_POINTER + +static inline +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->bp + 1; +} + +unsigned long unwind_get_return_address(struct unwind_state *state); + +#else /* !CONFIG_FRAME_POINTER */ + +static inline +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + +static inline +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return ftrace_graph_ret_addr(state->task, &state->graph_idx, + *state->sp, state->sp); +} + +#endif /* CONFIG_FRAME_POINTER */ + +#endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 2184943341bf..69a6e07e3149 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -26,6 +26,8 @@ struct mce { __u32 socketid; /* CPU socket ID */ __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0503f5bfb18d..45257cf84370 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -125,6 +125,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +ifdef CONFIG_FRAME_POINTER +obj-y += unwind_frame.o +else +obj-y += unwind_guess.o +endif + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 3242e591fa82..26b78d86f25a 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 90d84c3eee53..32a7d70913ac 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; - cpu = generic_processor_info(id, ver); + cpu = __generic_processor_info(id, ver, enabled); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -282,6 +277,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; + acpi_table_print_madt_entry(header); + acpi_lapic_addr = lapic_addr_ovr->address; return 0; @@ -705,7 +702,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -716,6 +713,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) numa_set_node(cpu, nid); } #endif + return 0; } int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) 
@@ -998,21 +996,6 @@ static int __init acpi_parse_madt_lapic_entries(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; - /* - * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). - */ - - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); - if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); - return count; - } - - register_lapic_address(acpi_lapic_addr); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_LOCAL_APIC); @@ -1031,8 +1014,8 @@ static int __init acpi_parse_madt_lapic_entries(void) return ret; } - x2count = madt_proc[0].count; - count = madt_proc[1].count; + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); @@ -1513,7 +1496,7 @@ void __init acpi_boot_table_init(void) * If acpi_disabled, bail out */ if (acpi_disabled) - return; + return; /* * Initialize the ACPI boot-time table parser. diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c new file mode 100644 index 000000000000..6fb478bf82fd --- /dev/null +++ b/arch/x86/kernel/acpi/cppc_msr.c @@ -0,0 +1,58 @@ +/* + * cppc_msr.c: MSR Interface for CPPC + * Copyright (c) 2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include <acpi/cppc_acpi.h> +#include <asm/msr.h> + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index adb3eaf8fe2a..48587335ede8 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - stack_start = (unsigned long)temp_stack + sizeof(temp_stack); + initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f3e9b2df4b16..f266b8a92a9e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. */ @@ -1374,7 +1376,6 @@ void setup_local_APIC(void) * Actually disabling the focus CPU check just makes the hang less * frequent as it makes the interrupt distribution model be more * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro */ /* @@ -1816,8 +1817,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1828,17 +1828,14 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, mp_lapic_addr); + APIC_BASE, address); } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2027,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -int generic_processor_info(int apicid, int version) +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals the currently allocated max logical CPU ID plus 1. + * All allocated CPU IDs should be in [0, nr_logical_cpuids), so the maximum of + * nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP.
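The cpc_read_ffh()/cpc_write_ffh() pair in the cppc_msr.c hunk earlier reduces ACPI "functional fixed hardware" register access to a masked bit-field read, or read-modify-write, of an MSR. A self-contained sketch of that arithmetic, with the MSR replaced by a plain 64-bit value so it runs anywhere; the helper names are illustrative, not kernel API:

    #include <stdio.h>
    #include <stdint.h>

    #define GENMASK_ULL(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

    /* Extract bit_width bits at bit_offset, as cpc_read_ffh() does. */
    static uint64_t ffh_read(uint64_t msr, unsigned int offset, unsigned int width)
    {
            uint64_t mask = GENMASK_ULL(offset + width - 1, offset);

            return (msr & mask) >> offset;
    }

    /* Read-modify-write the same field, as cpc_write_ffh() does. */
    static uint64_t ffh_write(uint64_t msr, unsigned int offset,
                              unsigned int width, uint64_t val)
    {
            uint64_t mask = GENMASK_ULL(offset + width - 1, offset);

            return (msr & ~mask) | ((val << offset) & mask);
    }

    int main(void)
    {
            uint64_t msr = 0xdead00ffcafef00dULL;

            msr = ffh_write(msr, 24, 8, 0x5a);
            printf("field = %#llx\n", (unsigned long long)ffh_read(msr, 24, 8));
            return 0;
    }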
+ */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +static int cpuid_to_apicid[] = { + [0 ... NR_CPUS - 1] = -1, +}; + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "Only %d processors supported." + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids - 1, nr_logical_cpuids, apicid); + return -1; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2102,8 +2145,16 @@ int generic_processor_info(int apicid, int version) * for BSP. */ cpu = 0; - } else - cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* Logical cpuid 0 is reserved for BSP. */ + cpuid_to_apicid[0] = apicid; + } else { + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + } /* * This can happen on physical hotplug. The sanity check at boot time @@ -2120,8 +2171,6 @@ int generic_processor_info(int apicid, int version) return -ENOSPC; } - num_processors++; - /* * Validate version */ @@ -2130,14 +2179,12 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } - physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -2150,11 +2197,23 @@ int generic_processor_info(int apicid, int version) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - set_cpu_present(cpu, true); + + if (enabled) { + num_processors++; + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + } else { + disabled_cpus++; + } return cpu; } +int generic_processor_info(int apicid, int version) +{ + return __generic_processor_info(apicid, version, true); +} + int hard_smp_processor_id(void) { return read_apic_id(); @@ -2277,7 +2336,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. 
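allocate_logical_cpuid() above is the core of the persistent cpuid-to-apicid mapping: a hot-removed CPU that comes back reclaims its old logical ID, and only genuinely new APIC IDs consume a fresh slot. The policy in miniature, with NR_CPUS shrunk so the sketch stays small; this models the allocation logic only, not the surrounding bookkeeping:

    #include <stdio.h>

    #define NR_CPUS 8

    static int cpuid_to_apicid[NR_CPUS] = { [0 ... NR_CPUS - 1] = -1 };
    static int nr_logical_cpuids = 1;   /* logical CPU 0 is reserved for the BSP */

    static int allocate_logical_cpuid(int apicid)
    {
            int i;

            /* Persistent mapping: a re-plugged CPU gets its old ID back. */
            for (i = 0; i < nr_logical_cpuids; i++)
                    if (cpuid_to_apicid[i] == apicid)
                            return i;

            if (nr_logical_cpuids >= NR_CPUS)
                    return -1;          /* table full: the CPU is ignored */

            cpuid_to_apicid[nr_logical_cpuids] = apicid;
            return nr_logical_cpuids++;
    }

    int main(void)
    {
            cpuid_to_apicid[0] = 0x10;                      /* BSP */
            printf("%d\n", allocate_logical_cpuid(0x22));   /* 1: first AP */
            printf("%d\n", allocate_logical_cpuid(0x30));   /* 2: second AP */
            printf("%d\n", allocate_logical_cpuid(0x22));   /* 1: reused slot */
            return 0;
    }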
*/ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5b2ae106bd4a..a4d7ff20ed22 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -25,7 +25,7 @@ static struct apic apic_physflat; static struct apic apic_flat; -struct apic __read_mostly *apic = &apic_flat; +struct apic *apic __ro_after_init = &apic_flat; EXPORT_SYMBOL_GPL(apic); static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector) static unsigned int flat_get_apic_id(unsigned long x) { - unsigned int id; - - id = (((x)>>24) & 0xFFu); - - return id; + return (x >> 24) & 0xFF; } static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xFFu)<<24); - return x; + return (id & 0xFF) << 24; } static unsigned int read_xapic_id(void) { - unsigned int id; - - id = flat_get_apic_id(apic_read(APIC_ID)); - return id; + return flat_get_apic_id(apic_read(APIC_ID)); } static int flat_apic_id_registered(void) @@ -154,7 +144,7 @@ static int flat_probe(void) return 1; } -static struct apic apic_flat = { +static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -248,7 +238,7 @@ static int physflat_probe(void) return 0; } -static struct apic apic_physflat = { +static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c05688b2deff..b109e4389c92 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } -struct apic apic_noop = { +struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, .acpi_madt_oem_check = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 714d4fda0d52..e08fe2c8dd8c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) static unsigned long numachip1_set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xffU) << 24); - return x; + return (id & 0xff) << 24; } static unsigned int numachip2_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 06dbaa458bfe..56012010332c 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,7 +142,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -static struct apic apic_bigsmp = { +static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7491f417a8e4..48e6d84f173e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void) * no meaning without the serial APIC bus. 
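The recurring change in the apic driver files here is tagging the driver structures __ro_after_init: they stay writable while boot code probes and patches them, then become read-only for the life of the system once init memory is freed. The annotation is essentially just a section attribute; a stand-alone sketch of the idea, with a made-up struct in place of struct apic (user space will not actually write-protect the section, that part is done by the kernel at the end of boot):

    #include <stdio.h>

    #define __ro_after_init __attribute__((__section__(".data..ro_after_init")))

    struct driver {
            const char *name;
            int (*probe)(void);
    };

    static int example_probe(void)
    {
            return 1;
    }

    /* Mutable during init (e.g. from an early_param handler), immutable after. */
    static struct driver example_driver __ro_after_init = {
            .name   = "example",
            .probe  = example_probe,
    };

    int main(void)
    {
            printf("%s -> %d\n", example_driver.name, example_driver.probe());
            return 0;
    }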
*/ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + || APIC_XAPIC(boot_cpu_apic_version)) return; setup_ioapic_ids_from_mpc_nocheck(); } @@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); else return id; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ade25320df96..015bbf30e3e3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } -static struct irq_chip hpet_msi_controller = { +static struct irq_chip hpet_msi_controller __ro_after_init = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7c43e716c158..c48264e202fd 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -72,7 +72,7 @@ static int probe_default(void) return 1; } -static struct apic apic_default = { +static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, @@ -126,7 +126,7 @@ static struct apic apic_default = { apic_driver(apic_default); -struct apic *apic = &apic_default; +struct apic *apic __ro_after_init = &apic_default; EXPORT_SYMBOL_GPL(apic); static int cmdline_apic __initdata; @@ -152,7 +152,7 @@ early_param("apic", parse_apic); void __init default_setup_apic_routing(void) { - int version = apic_version[boot_cpu_physical_apicid]; + int version = boot_cpu_apic_version; if (num_possible_cpus() > 8) { switch (boot_cpu_data.x86_vendor) { diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 54f35d988025..200af5ae9662 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -227,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } -static struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 4f13f54f1b1f..ff111f05a314 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -98,7 +98,7 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -static struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index cb0673c1e940..f034c84f74c3 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -533,11 +533,8 @@ static unsigned int x2apic_get_apic_id(unsigned long x) static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - /* maskout x2apic_extra_bits ? */ - x = id; - return x; + /* CHECKME: Do we need to mask out the xapic extra bits? 
*/ + return id; } static unsigned int uv_read_apic_id(void) @@ -560,7 +557,7 @@ static int uv_probe(void) return apic == &apic_x2apic_uv_x; } -static struct apic __refdata apic_x2apic_uv_x = { +static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2bd5c6ff7ee7..c62e015b126c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -29,10 +29,13 @@ void common(void) { BLANK(); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_CC_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif BLANK(); + OFFSET(TASK_TI_flags, task_struct, thread_info.flags); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ecdc1d217dc0..880aa093268d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -57,6 +57,11 @@ void foo(void) /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +#ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +#endif + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d875f97d4e0b..210927ee2e74 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -56,6 +56,11 @@ int main(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); +#ifdef CONFIG_CC_STACKPROTECTOR + DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + BLANK(); +#endif + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda03c527..9bd910a7dd0a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) - return; - - cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - c->cpu_index = 0; - filter_cpuid_features(c, false); + c->cpu_index = 0; + filter_cpuid_features(c, false); - if (this_cpu->c_bsp_init) - this_cpu->c_bsp_init(c); + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); @@ -1265,9 +1264,14 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; -struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) debug_idt_table }; +struct desc_ptr idt_descr __ro_after_init = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) idt_table, +}; +const struct desc_ptr debug_idt_descr = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) 
__aligned(PAGE_SIZE) __visible; @@ -1281,7 +1285,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; @@ -1305,11 +1309,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 27e46658ebe3..35691a6b0d32 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void) x86_hyper->x2apic_available && x86_hyper->x2apic_available(); } + +void hypervisor_pin_vcpu(int cpu) +{ + if (!x86_hyper) + return; + + if (x86_hyper->pin_vcpu) + x86_hyper->pin_vcpu(cpu); + else + WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d8ec849468..a7fdf453d895 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> +#include <linux/jump_label.h> #include <asm/processor.h> #include <asm/traps.h> @@ -292,6 +293,13 @@ static void print_mce(struct mce *m) if (m->misc) pr_cont("MISC %llx ", m->misc); + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + pr_cont("\n"); /* * Note this output is parsed by external tools and old fields @@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(msr_ops.misc(i)); + if (m->status & MCI_STATUS_ADDRV) { m->addr = mce_rdmsrl(msr_ops.addr(i)); @@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i) m->addr >>= shift; m->addr <<= shift; } + + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); } } @@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; - /* - * MCG_CAP.MCG_SER_P is necessary but not sufficient to know - * whether this processor will actually generate recoverable - * machine checks. Check to see if this is an E7 model Xeon. - * We can't do a model number check because E5 and E7 use the - * same model number. E5 doesn't support recovery, E7 does. 
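On SMCA systems, mce_read_aux() above treats MCA_ADDR as self-describing: bits [61:56] hold the least significant valid bit (LSB) of the address, and only bits [55:LSB] carry real address information, so everything outside that window is masked off. A stand-alone check of the masking, with invented values:

    #include <stdio.h>
    #include <stdint.h>

    #define GENMASK_ULL(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

    int main(void)
    {
            /* LSB field = 12: the address is only valid down to 4K granularity. */
            uint64_t addr = (12ULL << 56) | 0xabcdefULL;
            uint8_t lsb = (addr >> 56) & 0x3f;

            addr &= GENMASK_ULL(55, lsb);

            printf("lsb = %u, addr = %#llx\n", lsb, (unsigned long long)addr);
            /* prints: lsb = 12, addr = 0xabc000 */
            return 0;
    }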
- */ - if (mca_cfg.recovery || (mca_cfg.ser && - !strncmp(c->x86_model_id, - "Intel(R) Xeon(R) CPU E7-", 24))) - set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank) * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable memcpy_mcsafe() */ static int __init mcheck_enable(char *str) { @@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void) static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif +DEFINE_STATIC_KEY_FALSE(mcsafe_key); +EXPORT_SYMBOL_GPL(mcsafe_key); + static int __init mcheck_late_init(void) { + if (mca_cfg.recovery) + static_branch_inc(&mcsafe_key); + mcheck_debugfs_init(); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 7b7f3be783d4..9b5403462936 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/string.h> #include <asm/amd_nb.h> #include <asm/apic.h> @@ -63,34 +64,71 @@ static const char * const th_names[] = { "execution_unit", }; -/* Define HWID to IP type mappings for Scalable MCA */ -struct amd_hwid amd_hwids[] = { - [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, - [SMCA_DF] = { "data_fabric", 0x2E }, - [SMCA_UMC] = { "umc", 0x96 }, - [SMCA_PB] = { "param_block", 0x5 }, - [SMCA_PSP] = { "psp", 0xFF }, - [SMCA_SMU] = { "smu", 0x1 }, +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" }; -EXPORT_SYMBOL_GPL(amd_hwids); - -const char * const amd_core_mcablock_names[] = { - [SMCA_LS] = "load_store", - [SMCA_IF] = "insn_fetch", - [SMCA_L2_CACHE] = "l2_cache", - [SMCA_DE] = "decode_unit", - [RES] = "", - [SMCA_EX] = "execution_unit", - [SMCA_FP] = "floating_point", - [SMCA_L3_CACHE] = "l3_cache", + +struct smca_bank_name smca_bank_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." 
}, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(amd_core_mcablock_names); +EXPORT_SYMBOL_GPL(smca_bank_names); + +static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype, xec_bitmap } */ + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, -const char * const amd_df_mcablock_names[] = { - [SMCA_CS] = "coherent_slave", - [SMCA_PIE] = "pie", + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + +struct smca_bank_info smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ @@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; * CPU Initialization */ +static void get_smca_bank_info(unsigned int bank) +{ + unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + struct smca_hwid_mcatype *type; + u32 high, instanceId; + u16 hwid, mcatype; + + /* Collect bank_info using CPU 0 for now. 
*/ + if (cpu) + return; + + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid = high & MCI_IPID_HWID; + mcatype = (high & MCI_IPID_MCATYPE) >> 16; + hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + type = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == type->hwid_mcatype) { + smca_banks[bank].type = type; + smca_banks[bank].type_instance = instanceId; + break; + } + } +} + struct thresh_restart { struct threshold_block *b; int reset; @@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 get_block_address(u32 current_addr, u32 low, u32 high, +static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, */ u32 low, high; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) return addr; if (!(low & MCI_CONFIG_MCAX)) return addr; - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } @@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, */ smca_high &= ~BIT(2); + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) + smca_high |= BIT(5); + wrmsr(smca_addr, smca_low, smca_high); } @@ -421,12 +503,15 @@ out: void mce_amd_feature_init(struct cpuinfo_x86 *c) { u32 low = 0, high = 0, address = 0; - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (mce_flags.smca) + get_smca_bank_info(bank); + for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (threshold_err) m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) + if (m.status & MCI_STATUS_ADDRV) { rdmsrl(msr_addr, m.addr); + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. 
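get_smca_bank_info() above identifies a bank by packing the HardwareID and McaType fields from the high half of MCA_IPID into a single key and scanning smca_hwid_mcatypes[] for it. A sketch of that match; the exact packing and field masks are assumptions read off the hunk (HWID_MCATYPE and the MCI_IPID_* constants live in asm/mce.h), so treat the literals as illustrative:

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed packing, consistent with the table entries above. */
    #define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))

    int main(void)
    {
            uint32_t high = 0x00000096;     /* IPID[63:32]: HWID 0x96, McaType 0 */
            uint16_t hwid = high & 0xfff;
            uint16_t mcatype = (high >> 16) & 0xffff;

            if (HWID_MCATYPE(hwid, mcatype) == HWID_MCATYPE(0x96, 0x0))
                    printf("bank is a Unified Memory Controller\n");
            return 0;
    }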
+ */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + mce_log(&m); wrmsrl(msr_status, 0); @@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; - int cpu = smp_processor_id(); - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + unsigned int bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + if (!smca_banks[bank].type) + return NULL; + + bank_type = smca_banks[bank].type->bank_type; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_bank_names[bank_type].name, + smca_banks[bank].type_instance); + return buf_mcatype; +} + static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, unsigned int block, u32 address) { @@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - (bank == 4 ? 
bank4_names(b) : th_names[bank])); + get_name(bank, b)); if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + address = get_block_address(cpu, address, low, high, bank, ++block); if (!address) return 0; @@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = th_names[bank]; + const char *name = get_name(bank, NULL); int err = 0; if (is_shared_bank(bank)) { @@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); + err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); if (!err) goto out; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 28f1b54b7fad..24e87e74990d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(const struct mtrr_ops *ops) +void __init set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6c7ced07d16d..ad8bd763efa5 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index, bool get_mtrr_state(void); void mtrr_bp_pat_init(void); -extern void set_mtrr_ops(const struct mtrr_ops *ops); +extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 92e8f0a7159c..9b7cf5c28f5f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -17,7 +17,7 @@ #include <linux/sysfs.h> #include <asm/stacktrace.h> - +#include <asm/unwind.h> int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -25,11 +25,29 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin = task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, - void *data) + char *log_lvl) { + touch_nmi_watchdog(); printk("%s [<%p>] %s%pB\n", - (char *)data, (void *)address, reliable ? "" : "? ", + log_lvl, (void *)address, reliable ? "" : "? 
", (void *)address); } @@ -38,176 +56,120 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) { - unsigned long ret_addr; - int index; - - if (addr != (unsigned long)return_to_handler) - return; - - index = task->curr_ret_stack; - - if (!task->ret_stack || index < *graph) - return; - - index -= *graph; - ret_addr = task->ret_stack[index].ret; - - ops->address(data, ret_addr, 1); + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; - (*graph)++; -} -#else -static inline void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) -{ } -#endif - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct task_struct *task, - void *p, unsigned int size, void *end) -{ - void *t = task_stack_page(task); - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p >= t && p < t + THREAD_SIZE - size; -} + printk("%sCall Trace:\n", log_lvl); -unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; + unwind_start(&state, task, regs, stack); /* - * If we overflowed the stack into a guard page, jump back to the - * bottom of the usable stack. + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * + * x86-32 can have up to three stacks: + * - task stack + * - softirq stack + * - hardirq stack */ - if ((unsigned long)task_stack_page(task) - (unsigned long)stack < - PAGE_SIZE) - stack = (unsigned long *)task_stack_page(task); - - while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, 0); - } - print_ftrace_graph_addr(addr, data, ops, task, graph); - } - stack++; - } - return bp; -} -EXPORT_SYMBOL_GPL(print_context_stack); - -unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *ret_addr = &frame->return_address; + for (; stack; stack = stack_info.next_sp) { + const char *str_begin, *str_end; - while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) { - unsigned long addr = *ret_addr; + /* + * If we overflowed the task stack into a guard page, jump back + * to the bottom of the usable stack. 
+ */ + if (task_stack_page(task) - (void *)stack < PAGE_SIZE) + stack = task_stack_page(task); - if (!__kernel_text_address(addr)) + if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - if (ops->address(data, addr, 1)) - break; - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, task, graph); - } - - return (unsigned long)frame; -} -EXPORT_SYMBOL_GPL(print_context_stack_bp); - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static int print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_stack_address(addr, reliable, data); - return 0; -} - -static const struct stacktrace_ops print_trace_ops = { - .stack = print_trace_stack, - .address = print_trace_address, - .walk_stack = print_context_stack, -}; - -void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} + stack_type_str(stack_info.type, &str_begin, &str_end); + if (str_begin) + printk("%s <%s> ", log_lvl, str_begin); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = *stack; + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); + } -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); + if (str_end) + printk("%s <%s> ", log_lvl, str_end); + } } void show_stack(struct task_struct *task, unsigned long *sp) { - unsigned long bp = 0; - unsigned long stack; + task = task ? : current; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. 
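The rewritten show_trace_log_lvl() above no longer hard-codes the hop between stacks: get_stack_info() classifies whatever pointer it is handed and reports next_sp, the saved pointer into the stack that was interrupted, and the outer loop simply follows that chain until it runs out. The shape of the traversal, with invented records standing in for real stacks:

    #include <stdio.h>

    struct stack_info {
            const char *name;
            struct stack_info *next_sp;     /* the stack this one interrupted */
    };

    int main(void)
    {
            /* A hardirq that arrived during a softirq on the task stack. */
            struct stack_info task    = { "task",    NULL };
            struct stack_info softirq = { "softirq", &task };
            struct stack_info hardirq = { "hardirq", &softirq };
            struct stack_info *s;

            for (s = &hardirq; s; s = s->next_sp)
                    printf("dumping <%s> stack\n", s->name);
            return 0;
    }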
*/ - if (!sp && (!task || task == current)) { - sp = &stack; - bp = stack_frame(current, NULL); - } + if (!sp && task == current) + sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, bp, ""); + show_stack_log_lvl(task, NULL, sp, ""); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); + show_stack_log_lvl(current, regs, NULL, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 09675712eba8..06eb322b5f9f 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,93 +16,121 @@ #include <asm/stacktrace.h> -static void *is_irq_stack(void *p, void *irq) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - if (p < irq || p >= (irq + THREAD_SIZE)) - return NULL; - return irq + THREAD_SIZE; + switch (type) { + case STACK_TYPE_IRQ: + case STACK_TYPE_SOFTIRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + default: + *begin = NULL; + *end = NULL; + } } - -static void *is_hardirq_stack(unsigned long *stack, int cpu) +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - void *irq = per_cpu(hardirq_stack, cpu); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - return is_irq_stack(stack, irq); -} + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; -static void *is_softirq_stack(unsigned long *stack, int cpu) -{ - void *irq = per_cpu(softirq_stack, cpu); + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; - return is_irq_stack(stack, irq); + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. + */ + info->next_sp = (unsigned long *)*begin; + + return true; } -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - const unsigned cpu = get_cpu(); - int graph = 0; - u32 *prev_esp; + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (!task) - task = current; + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; - if (!stack) { - unsigned long dummy; + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; - if (!bp) - bp = stack_frame(task, regs); + return true; +} - for (;;) { - void *end_stack; +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; - end_stack = is_hardirq_stack(stack, cpu); - if (!end_stack) - end_stack = is_softirq_stack(stack, cpu); + task = task ? 
: current; - bp = ops->walk_stack(task, stack, bp, ops, data, - end_stack, &graph); + if (in_task_stack(stack, task, info)) + goto recursion_check; - /* Stop if not on irq stack */ - if (!end_stack) - break; + if (task != current) + goto unknown; - /* The previous esp is saved on the bottom of the stack */ - prev_esp = (u32 *)(end_stack - THREAD_SIZE); - stack = (unsigned long *)*prev_esp; - if (!stack) - break; + if (in_hardirq_stack(stack, info)) + goto recursion_check; - if (ops->stack(data, "IRQ") < 0) - break; - touch_nmi_watchdog(); + if (in_softirq_stack(stack, info)) + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; } - put_cpu(); + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } -EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + if (!try_get_task_stack(task)) + return; + + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { @@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); + + put_task_stack(task); } @@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 9ee4520ce83c..36cf1a498227 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,261 +16,145 @@ #include <asm/stacktrace.h> +static char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ NMI_STACK-1 ] = "NMI", + [ DEBUG_STACK-1 ] = "#DB", + [ MCE_STACK-1 ] = "#MC", +}; -#define N_EXCEPTION_STACKS_END \ - (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) - -static char x86_stack_ids[][8] = { - [ DEBUG_STACK-1 ] = "#DB", - [ NMI_STACK-1 ] = "NMI", - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ MCE_STACK-1 ] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [ N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS_END ] = "#DB[?]" -#endif +static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? 
- * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = x86_stack_ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - x86_stack_ids[j][4] = '1' + - (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = x86_stack_ids[j]; - return (unsigned long *)end; - } -#endif + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + + switch (type) { + case STACK_TYPE_IRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: + *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; + *end = "EOE"; + break; + default: + *begin = NULL; + *end = NULL; } - return NULL; } -static inline int -in_irq_stack(unsigned long *stack, unsigned long *irq_stack, - unsigned long *irq_stack_end) +static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - return (stack >= irq_stack && stack < irq_stack_end); -} - -static const unsigned long irq_stack_size = - (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); - -enum stack_type { - STACK_IS_UNKNOWN, - STACK_IS_NORMAL, - STACK_IS_EXCEPTION, - STACK_IS_IRQ, -}; - -static enum stack_type -analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, - unsigned long **stack_end, unsigned long *irq_stack, - unsigned *used, char **id) -{ - unsigned long addr; + unsigned long *begin, *end; + struct pt_regs *regs; + unsigned k; - addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); - if ((unsigned long)task_stack_page(task) == addr) - return STACK_IS_NORMAL; + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - *stack_end = in_exception_stack(cpu, (unsigned long)stack, - used, id); - if (*stack_end) - return STACK_IS_EXCEPTION; + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - (exception_stack_sizes[k] / sizeof(long)); + regs = (struct pt_regs *)end - 1; - if (!irq_stack) - return STACK_IS_NORMAL; + if (stack < begin || stack >= end) + continue; - *stack_end = irq_stack; - irq_stack = irq_stack - irq_stack_size; + info->type = STACK_TYPE_EXCEPTION + k; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)regs->sp; - if (in_irq_stack(stack, irq_stack, *stack_end)) - return STACK_IS_IRQ; + return true; + } - return STACK_IS_UNKNOWN; + return false; } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct 
stacktrace_ops *ops, void *data) +static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - const unsigned cpu = get_cpu(); - unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned long dummy; - unsigned used = 0; - int graph = 0; - int done = 0; - - if (!task) - task = current; - - if (!stack) { - if (regs) - stack = (unsigned long *)regs->sp; - else if (task != current) - stack = (unsigned long *)task->thread.sp; - else - stack = &dummy; - } + unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); - if (!bp) - bp = stack_frame(task, regs); /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. */ - while (!done) { - unsigned long *stack_end; - enum stack_type stype; - char *id; + if (stack < begin || stack > end) + return false; - stype = analyze_stack(cpu, task, stack, &stack_end, - irq_stack, &used, &id); + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; - /* Default finish unless specified to continue */ - done = 1; + /* + * The next stack pointer is the first thing pushed by the entry code + * after switching to the irq stack. + */ + info->next_sp = (unsigned long *)*(end - 1); - switch (stype) { + return true; +} - /* Break out early if we are on the thread stack */ - case STACK_IS_NORMAL: - break; +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; - case STACK_IS_EXCEPTION: + task = task ? : current; - if (ops->stack(data, id) < 0) - break; + if (in_task_stack(stack, task, info)) + goto recursion_check; - bp = ops->walk_stack(task, stack, bp, ops, - data, stack_end, &graph); - ops->stack(data, "<EOE>"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) stack_end[-2]; - done = 0; - break; + if (task != current) + goto unknown; - case STACK_IS_IRQ: + if (in_exception_stack(stack, info)) + goto recursion_check; - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(task, stack, bp, - ops, data, stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (stack_end[-1]); - irq_stack = NULL; - ops->stack(data, "EOI"); - done = 0; - break; + if (in_irq_stack(stack, info)) + goto recursion_check; - case STACK_IS_UNKNOWN: - ops->stack(data, "UNK"); - break; - } - } + goto unknown; +recursion_check: /* - * This handles the process stack: + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. 
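Both the 32-bit and 64-bit get_stack_info() funnel into the same recursion_check: each stack type is one bit in the caller-owned visit_mask, so a corrupted next_sp that leads back into an already-walked stack is detected in constant time instead of looping forever. The check in isolation:

    #include <stdio.h>

    enum stack_type { STACK_TYPE_TASK = 1, STACK_TYPE_IRQ, STACK_TYPE_EXCEPTION };

    static int visit_once(unsigned long *visit_mask, enum stack_type type)
    {
            if (*visit_mask & (1UL << type))
                    return -1;              /* seen before: a loop in the chain */
            *visit_mask |= 1UL << type;
            return 0;
    }

    int main(void)
    {
            unsigned long mask = 0;

            printf("%d\n", visit_once(&mask, STACK_TYPE_IRQ));      /* 0 */
            printf("%d\n", visit_once(&mask, STACK_TYPE_TASK));     /* 0 */
            printf("%d\n", visit_once(&mask, STACK_TYPE_IRQ));      /* -1 */
            return 0;
    }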
*/ - bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); - put_cpu(); + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } -EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack; - int cpu; int i; - preempt_disable(); - cpu = smp_processor_id(); + if (!try_get_task_stack(task)) + return; - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); + irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - /* - * Debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu: - */ - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { @@ -299,18 +183,17 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack++; touch_nmi_watchdog(); } - preempt_enable(); pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); + + put_task_stack(task); } void show_regs(struct pt_regs *regs) { int i; - unsigned long sp; - sp = regs->sp; show_regs_print_info(KERN_DEFAULT); __show_regs(regs, 1); @@ -325,8 +208,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501f8935..b85fe5f91c3f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +166,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -348,7 +350,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type || current_type == E820_PRAM) { + if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -388,11 +390,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) while (nr_map) { u64 start = biosmap->addr; u64 size = biosmap->size; - u64 end = start + size; + u64 end = start + size - 1; u32 type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. 
*/ - if (start > end) + if (start > end && likely(size)) return -1; e820_add_region(start, size, type); @@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820, start, size, old_type, new_type); + return __e820_update_range(e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820_saved, start, size, old_type, + return __e820_update_range(e820_saved, start, size, old_type, new_type); } @@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, e820_print_type(old_type); printk(KERN_CONT "\n"); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; u64 final_start, final_end; u64 ei_end; @@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map)) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) return; printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) { - sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), - &e820_saved.nr_map); + sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), + &e820_saved->nr_map); } #define MAX_GAP_END 0x100000000ull /* @@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr) { unsigned long long last; - int i = e820.nr_map; + int i = e820->nr_map; int found = 0; last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; + unsigned long long start = e820->map[i].addr; + unsigned long long end = start + e820->map[i].size; if (end < start_addr) continue; @@ -649,6 +651,33 @@ __init void e820_setup_gap(void) gapstart, gapstart + gapsize - 1); } +/* + * Called late during init, in free_initmem(). + * + * Initial e820 and e820_saved are largish __initdata arrays. + * Copy them to a (usually much smaller) dynamically allocated area. + * This is done after all tweaks we ever do to them: + * all functions which modify them are __init functions; + * they won't exist after this point.
+ */ +__init void e820_reallocate_tables(void) +{ + struct e820map *n; + int size; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820, size); + e820 = n; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_saved, size); + e820_saved = n; +} + /** * Because of the size limitation of struct boot_params, only first * 128 E820 memory entries are passed to kernel via @@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); @@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) int i; unsigned long pfn = 0; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (pfn < PFN_UP(ei->addr)) register_nosave_region(pfn, PFN_UP(ei->addr)); @@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (ei->type == E820_NVS) acpi_nvs_register(ei->addr, ei->size); @@ -754,22 +783,18 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; unsigned long start_pfn; unsigned long end_pfn; - /* - * Persistent memory is accounted as ram for purposes of - * establishing max_pfn and mem_map. 
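e820_reallocate_tables() above relies on map[] being the last member of struct e820map, so offsetof() plus the number of entries actually in use gives the size of a trimmed copy. The same idiom in isolation, with a hypothetical struct (the kernel's real one is struct e820map in <asm/e820.h>):

#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/string.h>

struct map {
	int nr;			/* entries in use */
	long entry[128];	/* capacity, mostly unused */
};

static struct map *shrink_copy(const struct map *src)
{
	/* header plus only the occupied part of the trailing array */
	size_t size = offsetof(struct map, entry) +
		      sizeof(src->entry[0]) * src->nr;
	struct map *n = kmalloc(size, GFP_KERNEL);

	if (n)
		memcpy(n, src, size);
	/* callers must never index past ->nr in the copy */
	return n;
}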
- */ - if (ei->type != E820_RAM && ei->type != E820_PRAM) + if (ei->type != type) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -794,15 +819,15 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN); + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32-PAGE_SHIFT)); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -856,7 +881,7 @@ static int __init parse_memmap_one(char *p) */ saved_max_pfn = e820_end_of_ram_pfn(); #endif - e820.nr_map = 0; + e820->nr_map = 0; userdef = 1; return 0; } @@ -903,8 +928,8 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), - &e820.nr_map) < 0) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), + &e820->nr_map) < 0) early_panic("Invalid user supplied memory map"); printk(KERN_INFO "e820: user-defined physical RAM map:\n"); @@ -912,7 +937,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +951,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +967,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +986,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -991,35 +1016,35 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); e820_res = res; - for (i = 0; i < e820.nr_map; i++) { - end = e820.map[i].addr + e820.map[i].size - 1; + for (i = 0; i < e820->nr_map; i++) { + end = e820->map[i].addr + e820->map[i].size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820.map[i].type); - res->start = e820.map[i].addr; + res->name = e820_type_to_string(e820->map[i].type); + res->start = e820->map[i].addr; res->end = end; - res->flags = e820_type_to_iomem_type(e820.map[i].type); - res->desc = e820_type_to_iores_desc(e820.map[i].type); + res->flags = e820_type_to_iomem_type(e820->map[i].type); + res->desc = e820_type_to_iores_desc(e820->map[i].type); /* * don't register the region that could be conflicted with * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (do_mark_busy(e820.map[i].type, res)) { + if (do_mark_busy(e820->map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; + for (i = 0; i < e820_saved->nr_map; i++) 
{ + struct e820entry *entry = &e820_saved->map[i]; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry->type)); @@ -1027,7 +1052,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where it is? */ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1051,7 +1076,7 @@ void __init e820_reserve_resources_late(void) struct resource *res; res = e820_res; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; @@ -1061,8 +1086,8 @@ void __init e820_reserve_resources_late(void) * Try to bump up RAM regions to reasonable boundaries to * avoid stolen RAM: */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *entry = &e820->map[i]; u64 start, end; if (entry->type != E820_RAM) @@ -1110,7 +1135,7 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820.nr_map = 0; + e820->nr_map = 0; e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } @@ -1124,7 +1149,7 @@ void __init setup_memory_map(void) char *who; who = x86_init.resources.memory_setup(); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } @@ -1141,8 +1166,8 @@ void __init memblock_x86_fill(void) */ memblock_allow_resize(); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; end = ei->addr + ei->size; if (end != (resource_size_t)end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index de7501edb21c..18bb3a639197 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -555,7 +555,7 @@ intel_graphics_stolen(int num, int slot, int func, /* Mark this space as reserved */ e820_add_region(base, size, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } static void __init intel_graphics_quirks(int num, int slot, int func) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 93982aebb398..2f2b8c7ccb85 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - current_thread_info()->status = 0; if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d036cfb4495d..8639bb2ae058 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer) == -EBUSY) { + frame_pointer, parent) == -EBUSY) { *parent = old; return; } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6f8902b0d151..5f401262f12d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD 
ENTRY(startup_32) - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4 * start_secondary(). */ ENTRY(start_cpu0) - movl stack_start, %ecx + movl initial_stack, %ecx movl %ecx, %esp jmp *(initial_code) ENDPROC(start_cpu0) @@ -307,7 +307,7 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp @@ -703,7 +703,7 @@ ENTRY(initial_page_table) .data .balign 4 -ENTRY(stack_start) +ENTRY(initial_stack) .long init_thread_union+THREAD_SIZE __INITRODATA diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9f8efc9f0075..c98a559c346e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,7 +66,7 @@ startup_64: */ /* - * Setup stack for verify_cpu(). "-8" because stack_start is defined + * Setup stack for verify_cpu(). "-8" because initial_stack is defined * this way, see below. Our best guess is a NULL ptr for stack * termination heuristics and we don't want to break anything which * might depend on it (kgdb, ...). @@ -226,7 +226,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip), %rsp + movq initial_stack(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -310,7 +310,7 @@ ENDPROC(secondary_startup_64) * start_secondary(). */ ENTRY(start_cpu0) - movq stack_start(%rip),%rsp + movq initial_stack(%rip),%rsp movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder pushq $__KERNEL_CS # set correct cs @@ -319,17 +319,15 @@ ENTRY(start_cpu0) ENDPROC(start_cpu0) #endif - /* SMP bootup changes these two */ + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - - GLOBAL(stack_start) + GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 - .word 0 __FINITDATA bad_address: diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4a7903714065..9ebd0b0e73d9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f2356bda2b05..3407b148c240 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved.nr_map; + nr_e820_entries = e820_saved->nr_map; /* TODO: Pass entries more than E820MAX in bootparams setup data */ if (nr_e820_entries > E820MAX) nr_e820_entries = E820MAX; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved.map, + memcpy(¶ms->e820_map, &e820_saved->map, nr_e820_entries * sizeof(struct e820entry)); return 0; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 04cde527d728..8e36f249646e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -50,6 +50,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include 
<asm/nmi.h> +#include <asm/switch_to.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_DX] = 0; gdb_regs[GDB_SI] = 0; gdb_regs[GDB_DI] = 0; - gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; #ifdef CONFIG_X86_32 gdb_regs[GDB_DS] = __KERNEL_DS; gdb_regs[GDB_ES] = __KERNEL_DS; gdb_regs[GDB_PS] = 0; gdb_regs[GDB_CS] = __KERNEL_CS; - gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_PS] = 0; gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_R14] = 0; gdb_regs[GDB_R15] = 0; #endif + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_SP] = p->thread.sp; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 7847e5c0e0b5..28cee019209c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,7 +45,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4425f593f0ec..3bb4c5f021f6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -24,7 +24,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index c2bedaea11f7..4afc67f5facc 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -184,7 +184,7 @@ out: static struct kobj_attribute type_attr = __ATTR_RO(type); -static struct bin_attribute data_attr = { +static struct bin_attribute data_attr __ro_after_init = { .attr = { .name = "data", .mode = S_IRUGO, diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1726c4c12336..865058d087ac 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -575,9 +575,6 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } - -#ifdef CONFIG_QUEUED_SPINLOCKS - #include <asm/qspinlock.h> static void kvm_wait(u8 *ptr, u8 val) @@ -606,243 +603,6 @@ out: local_irq_restore(flags); } -#else /* !CONFIG_QUEUED_SPINLOCKS */ - -enum kvm_contention_stat { - TAKEN_SLOW, - TAKEN_SLOW_PICKUP, - RELEASED_SLOW, - RELEASED_SLOW_KICKED, - NR_CONTENTION_STATS -}; - -#ifdef CONFIG_KVM_DEBUG_FS -#define HISTO_BUCKETS 30 - -static struct kvm_spinlock_stats -{ - u32 contention_stats[NR_CONTENTION_STATS]; - u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_blocked; -} spinlock_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - u8 ret; - u8 old; - - old = READ_ONCE(zero_stats); - if (unlikely(old)) { - ret = cmpxchg(&zero_stats, old, 0); - /* This ensures only one fellow resets the stat */ - if (ret == old) - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - } -} - -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ - 
check_zero(); - spinlock_stats.contention_stats[var] += val; -} - - -static inline u64 spin_time_start(void) -{ - return sched_clock(); -} - -static void __spin_time_accum(u64 delta, u32 *array) -{ - unsigned index; - - index = ilog2(delta); - check_zero(); - - if (index < HISTO_BUCKETS) - array[index]++; - else - array[HISTO_BUCKETS]++; -} - -static inline void spin_time_accum_blocked(u64 start) -{ - u32 delta; - - delta = sched_clock() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); - spinlock_stats.time_blocked += delta; -} - -static struct dentry *d_spin_debug; -static struct dentry *d_kvm_debug; - -static struct dentry *kvm_init_debugfs(void) -{ - d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); - if (!d_kvm_debug) - printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); - - return d_kvm_debug; -} - -static int __init kvm_spinlock_debugfs(void) -{ - struct dentry *d_kvm; - - d_kvm = kvm_init_debugfs(); - if (d_kvm == NULL) - return -ENOMEM; - - d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); - - debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - - debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW]); - debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); - - debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW]); - debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - - debugfs_create_u64("time_blocked", 0444, d_spin_debug, - &spinlock_stats.time_blocked); - - debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); - - return 0; -} -fs_initcall(kvm_spinlock_debugfs); -#else /* !CONFIG_KVM_DEBUG_FS */ -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ -} - -static inline u64 spin_time_start(void) -{ - return 0; -} - -static inline void spin_time_accum_blocked(u64 start) -{ -} -#endif /* CONFIG_KVM_DEBUG_FS */ - -struct kvm_lock_waiting { - struct arch_spinlock *lock; - __ticket_t want; -}; - -/* cpus 'waiting' on a spinlock to become available */ -static cpumask_t waiting_cpus; - -/* Track spinlock on which a cpu is waiting */ -static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); - -__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) -{ - struct kvm_lock_waiting *w; - int cpu; - u64 start; - unsigned long flags; - __ticket_t head; - - if (in_nmi()) - return; - - w = this_cpu_ptr(&klock_waiting); - cpu = smp_processor_id(); - start = spin_time_start(); - - /* - * Make sure an interrupt handler can't upset things in a - * partially setup state. - */ - local_irq_save(flags); - - /* - * The ordering protocol on this is that the "lock" pointer - * may only be set non-NULL if the "want" ticket is correct. - * If we're updating "want", we must first clear "lock". - */ - w->lock = NULL; - smp_wmb(); - w->want = want; - smp_wmb(); - w->lock = lock; - - add_stats(TAKEN_SLOW, 1); - - /* - * This uses set_bit, which is atomic but we should not rely on its - * reordering gurantees. So barrier is needed after this call. - */ - cpumask_set_cpu(cpu, &waiting_cpus); - - barrier(); - - /* - * Mark entry to slowpath before doing the pickup test to make - * sure we don't deadlock with an unlocker. 
- */ - __ticket_enter_slowpath(lock); - - /* make sure enter_slowpath, which is atomic does not cross the read */ - smp_mb__after_atomic(); - - /* - * check again make sure it didn't become free while - * we weren't looking. - */ - head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(head, want)) { - add_stats(TAKEN_SLOW_PICKUP, 1); - goto out; - } - - /* - * halt until it's our turn and kicked. Note that we do safe halt - * for irq enabled case to avoid hang when lock info is overwritten - * in irq spinlock slowpath and no spurious interrupt occur to save us. - */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); - -out: - cpumask_clear_cpu(cpu, &waiting_cpus); - w->lock = NULL; - local_irq_restore(flags); - spin_time_accum_blocked(start); -} -PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); - -/* Kick vcpu waiting on @lock->head to reach value @ticket */ -static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) -{ - int cpu; - - add_stats(RELEASED_SLOW, 1); - for_each_cpu(cpu, &waiting_cpus) { - const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (READ_ONCE(w->lock) == lock && - READ_ONCE(w->want) == ticket) { - add_stats(RELEASED_SLOW_KICKED, 1); - kvm_kick_cpu(cpu); - break; - } - } -} - -#endif /* !CONFIG_QUEUED_SPINLOCKS */ - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -854,16 +614,11 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; -#ifdef CONFIG_QUEUED_SPINLOCKS __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; -#else /* !CONFIG_QUEUED_SPINLOCKS */ - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); - pv_lock_ops.unlock_kick = kvm_unlock_kick; -#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3692249a70f1..60b9949f1e65 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,7 +29,7 @@ #include <asm/x86_init.h> #include <asm/reboot.h> -static int kvmclock = 1; +static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static cycle_t kvm_sched_clock_offset; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 068c4a929de6..0f8d20497383 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; + if (!smp_found_config) + return; + if (!mpf) return; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 1939a0269377..2c55a003b793 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,7 +8,6 @@ #include <asm/paravirt.h> -#ifdef CONFIG_QUEUED_SPINLOCKS __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); @@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void) return pv_lock_ops.queued_spin_unlock.func == __raw_callee_save___native_queued_spin_unlock; } -#endif struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP -#ifdef CONFIG_QUEUED_SPINLOCKS .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick 
= paravirt_nop, -#else /* !CONFIG_QUEUED_SPINLOCKS */ - .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), - .unlock_kick = paravirt_nop, -#endif /* !CONFIG_QUEUED_SPINLOCKS */ #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1acfd76e3e26..bbf3d5933eaa 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -332,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, #ifdef CONFIG_X86_64 .read_cr8 = native_read_cr8, @@ -389,7 +388,7 @@ NOKPROBE_SYMBOL(native_load_idt); #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif -struct pv_mmu_ops pv_mmu_ops = { +struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 158dc0650d5d..920c6ae08592 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); #endif @@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index e70087a04cc8..bb3840cedb4f 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); #endif @@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 62c0b0ea2ce4..4002b475171c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> +#include <asm/switch_to.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -513,6 +514,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } /* + * Return saved PC of a blocked thread. + * What is this good for? It will always be the scheduler or ret_from_fork.
+ */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + struct inactive_task_frame *frame = + (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); + return READ_ONCE_NOCHECK(frame->ret_addr); +} + +/* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack @@ -520,15 +532,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) */ unsigned long get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip; + unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + if (!try_get_task_stack(p)) + return 0; + start = (unsigned long)task_stack_page(p); if (!start) - return 0; + goto out; /* * Layout of the stack page: @@ -537,9 +552,7 @@ unsigned long get_wchan(struct task_struct *p) * PADDING * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start + * ----------- bottom = start * * The tasks stack pointer points at the location where the * framepointer is stored. The data on the stack is: @@ -550,20 +563,25 @@ unsigned long get_wchan(struct task_struct *p) */ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); + bottom = start; sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) - return 0; + goto out; - fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) - return 0; + goto out; ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; + if (!in_sched_functions(ip)) { + ret = ip; + goto out; + } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; + +out: + put_task_stack(p); + return ret; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d86be29c38c7..bd7be8efdc4c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,17 +55,6 @@ #include <asm/switch_to.h> #include <asm/vm86.h> -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - -/* - * Return saved PC of a blocked thread. 
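thread_saved_pc() above, get_wchan(), and the copy_thread_tls() hunks below all poke at the frame this series parks at a sleeping task's saved stack pointer (thread.sp). Pieced together from those accesses (frame->bp, frame->ret_addr, frame->bx, frame->di on 32-bit, frame->r12 on 64-bit), the layout in <asm/switch_to.h> is roughly the following; see that header for the authoritative definition:

#include <asm/ptrace.h>

/* sketch of what thread.sp points at while a task is switched out */
struct inactive_task_frame {
#ifdef CONFIG_X86_64
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;	/* kernel thread: function argument */
#else
	unsigned long si;
	unsigned long di;	/* kernel thread: function argument */
#endif
	unsigned long bx;	/* kernel thread: function to call */
	unsigned long bp;	/* frame pointer, keeps the unwinder going */
	unsigned long ret_addr;	/* where the switch "returns" to */
};

/* what copy_thread_tls() carves out at the top of the child stack */
struct fork_frame {
	struct inactive_task_frame frame;
	struct pt_regs regs;
};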
- */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *)tsk->thread.sp)[3]; -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -101,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = __read_cr4_safe(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); @@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); + struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); + struct inactive_task_frame *frame = &fork_frame->frame; struct task_struct *tsk; int err; - p->thread.sp = (unsigned long) childregs; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.ip = (unsigned long) ret_from_kernel_thread; - task_user_gs(p) = __KERNEL_STACK_CANARY; - childregs->ds = __USER_DS; - childregs->es = __USER_DS; - childregs->fs = __KERNEL_PERCPU; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->di = arg; p->thread.io_bitmap_ptr = NULL; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; - p->thread.ip = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 63236d8f84bf..de9acaf2d371 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -50,8 +50,6 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> -asmlinkage extern void ret_from_fork(void); - __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ @@ -141,12 +139,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, { int err; struct pt_regs *childregs; + struct fork_frame *fork_frame; + struct inactive_task_frame *frame; struct task_struct *me = current; p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); - p->thread.sp = (unsigned long) childregs; - set_tsk_thread_flag(p, TIF_FORK); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); @@ -160,15 +163,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->sp = (unsigned long)childregs; - childregs->ss = __KERNEL_DS; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->r12 = arg; return 0; 
} + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; @@ -511,7 +510,7 @@ void set_personality_ia32(bool x32) current->personality &= ~READ_IMPLIES_EXEC; /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ - current_thread_info()->status &= ~TS_COMPAT; + current->thread.status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); @@ -519,7 +518,7 @@ void set_personality_ia32(bool x32) current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; } } EXPORT_SYMBOL_GPL(set_personality_ia32); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a541ff..ce94c38cf4d6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return sp; prev_esp = (u32 *)(context); - if (prev_esp) - return (unsigned long)prev_esp; + if (*prev_esp) + return (unsigned long)*prev_esp; return (unsigned long)regs; } @@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_I386_REGS_POKED; + child->thread.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): @@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static struct user_regset x86_64_regsets[] __read_mostly = { +static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static struct user_regset x86_32_regsets[] __read_mostly = { +static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = { */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index cc457ff818ad..51402a7e4ca6 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); #endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#include <linux/jump_label.h> +#include <asm/string_64.h> + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + static_branch_inc(&mcsafe_key); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if ((capid0 & 0xc0) == 0xc0) + static_branch_inc(&mcsafe_key); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); 
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 63bf27d972b7..e244c19a2451 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -705,7 +705,7 @@ static void native_machine_power_off(void) tboot_shutdown(TB_SHUTDOWN_HALT); } -struct machine_ops machine_ops = { +struct machine_ops machine_ops __ro_after_init = { .power_off = native_machine_power_off, .shutdown = native_machine_shutdown, .emergency_restart = native_machine_emergency_restart, diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 80eab01c1a68..2408c1603438 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail) int i; struct e820entry *entry; - for (i = 0; i < e820.nr_map; i++) { - entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + entry = &e820->map[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa60f5f5a16..bbfbca5fea0c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -__visible unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features __ro_after_init; #else -__visible unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -458,8 +458,8 @@ static void __init e820_reserve_setup_data(void) early_memunmap(data, sizeof(*data)); } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } @@ -763,7 +763,7 @@ static void __init trim_bios_range(void) */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } /* called before trim_bios_range() to spare extra sanitize */ @@ -1032,7 +1032,7 @@ void __init setup_arch(char **cmdline_p) if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); printk(KERN_INFO "fixed physical RAM map:\n"); e820_print_map("bad_ppro"); } @@ -1096,19 +1096,19 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) { + reserve_bios_regions(); + + if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); efi_find_mirror(); - } - - reserve_bios_regions(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be called - * after ExitBootServices(). This is, in fact, a lie. 
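The RAS-capability fixups a few hunks above only flip the mcsafe_key static branch on CPUs whose capability register advertises recoverable memory machine checks; the copy path keys off it. A sketch of the consumer side of that pattern (the real gate is memcpy_mcsafe() in <asm/string_64.h>, which falls back to a plain memcpy() when the key is off):

#include <linux/jump_label.h>
#include <linux/string.h>

DECLARE_STATIC_KEY_FALSE(mcsafe_key);

static int copy_possibly_poisoned(void *dst, const void *src, size_t cnt)
{
	/* patched to a single jump at runtime; free when disabled */
	if (static_branch_unlikely(&mcsafe_key))
		/* returns -EFAULT instead of panicking on a #MC */
		return memcpy_mcsafe_unrolled(dst, src, cnt);

	memcpy(dst, src, cnt);
	return 0;
}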
- */ - if (efi_enabled(EFI_MEMMAP)) + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ efi_reserve_boot_services(); + } /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); @@ -1137,9 +1137,7 @@ void __init setup_arch(char **cmdline_p) * auditing all the early-boot CR4 manipulation would be needed to * rule it out. */ - if (boot_cpu_data.cpuid_level >= 0) - /* A CPU has %cr4 if and only if it has CPUID. */ - mmu_cr4_features = __read_cr4(); + mmu_cr4_features = __read_cr4(); memblock_set_current_limit(get_max_mapped()); @@ -1221,8 +1219,7 @@ void __init setup_arch(char **cmdline_p) /* * get boot-time SMP configuration: */ - if (smp_found_config) - get_smp_config(); + get_smp_config(); prefill_possible_map(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7a40e068302d..2bbd27f89802 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; EXPORT_SYMBOL(__per_cpu_offset); @@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE - 64; + IRQ_STACK_SIZE; #endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 04cb3212db2d..da20ecb5397a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -783,7 +783,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * than the tracee. */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4296beb8fdd3..7249dcf2cbcb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static struct sched_domain_topology_level numa_inside_package_topology[] = { +static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif @@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = { #endif { NULL, }, }; + +static struct sched_domain_topology_level x86_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + /* - * set_sched_topology() sets the topology internal to a CPU. The - * NUMA topologies are layered on top of it to build the full - * system topology. - * - * If NUMA nodes are observed to occur within a CPU package, this - * function should be called. It forces the sched domain code to - * only use the SMT level for the CPU portion of the topology. 
- * This essentially falls back to relying on NUMA information - * from the SRAT table to describe the entire system topology - * (except for hyperthreads). + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours and Intel Cluster-on-Die have this. */ -static void primarily_use_numa_for_topology(void) -{ - set_sched_topology(numa_inside_package_topology); -} +static bool x86_has_numa_in_package; void set_cpu_sibling_map(int cpu) { @@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu) c->booted_cores = cpu_data(i).booted_cores; } if (match_die(c, o) && !topology_same_node(c, o)) - primarily_use_numa_for_topology(); + x86_has_numa_in_package = true; } threads = cpumask_weight(topology_sibling_cpumask(cpu)); @@ -690,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -717,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); @@ -756,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) + if (APIC_INTEGRATED(boot_cpu_apic_version)) num_starts = 2; else num_starts = 0; @@ -942,7 +943,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) per_cpu(cpu_current_top_of_stack, cpu) = (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else - clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif } @@ -969,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = idle->thread.sp; + initial_stack = idle->thread.sp; /* * Enable the espfix hack for this CPU @@ -994,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -1249,7 +1249,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + if (APIC_INTEGRATED(boot_cpu_apic_version) && !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", @@ -1304,6 +1304,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } + + /* + * Set 'default' x86 topology, this matches default_topology() in that + * it has NUMA nodes as a topology level. See also + * native_smp_cpus_done(). + * + * Must be done before set_cpu_sibling_map() is run.
+ */ + set_sched_topology(x86_topology); + set_cpu_sibling_map(0); switch (smp_sanity_check(max_cpus)) { @@ -1323,14 +1333,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) break; } - default_setup_apic_routing(); - if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } + default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); pr_info("CPU%d: ", 0); @@ -1370,6 +1379,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); + if (x86_has_numa_in_package) + set_sched_topology(x86_numa_in_package_topology); + nmi_selftest(); impress_friends(); setup_ioapic_dest(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4738f5e0f2ab..0653788026e2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -8,80 +8,69 @@ #include <linux/export.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> +#include <asm/unwind.h> -static int save_stack_stack(void *data, char *name) +static int save_stack_address(struct stack_trace *trace, unsigned long addr, + bool nosched) { - return 0; -} - -static int -__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) -{ - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return 0; -#endif if (nosched && in_sched_functions(addr)) return 0; + if (trace->skip > 0) { trace->skip--; return 0; } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = addr; - return 0; - } else { - return -1; /* no more room, stop walking the stack */ - } -} -static int save_stack_address(void *data, unsigned long addr, int reliable) -{ - return __save_stack_address(data, addr, reliable, false); + if (trace->nr_entries >= trace->max_entries) + return -1; + + trace->entries[trace->nr_entries++] = addr; + return 0; } -static int -save_stack_address_nosched(void *data, unsigned long addr, int reliable) +static void __save_stack_trace(struct stack_trace *trace, + struct task_struct *task, struct pt_regs *regs, + bool nosched) { - return __save_stack_address(data, addr, reliable, true); -} + struct unwind_state state; + unsigned long addr; -static const struct stacktrace_ops save_stack_ops = { - .stack = save_stack_stack, - .address = save_stack_address, - .walk_stack = print_context_stack, -}; + if (regs) + save_stack_address(trace, regs->ip, nosched); -static const struct stacktrace_ops save_stack_ops_nosched = { - .stack = save_stack_stack, - .address = save_stack_address_nosched, - .walk_stack = print_context_stack, -}; + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || save_stack_address(trace, addr, nosched)) + break; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} /* * Save stack-backtrace addresses into a stack_trace buffer. 
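The collector rework above changes internals only; save_stack_trace() still fills a caller-supplied struct stack_trace. A minimal, hypothetical caller for reference:

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/stacktrace.h>

static void dump_my_stack(void)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 1,	/* skip dump_my_stack() itself */
	};
	unsigned int i;

	save_stack_trace(&trace);

	for (i = 0; i < trace.nr_entries; i++) {
		/* the collector appends ULONG_MAX as a terminator if room is left */
		if (entries[i] == ULONG_MAX)
			break;
		printk("%pS\n", (void *)entries[i]);
	}
}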
*/ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, regs, false); } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (!try_get_task_stack(tsk)) + return; + + __save_stack_trace(trace, tsk, NULL, true); + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 654f6c66fe45..8402907825b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -188,12 +188,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820.nr_map; i++) { - if ((e820.map[i].type != E820_RAM) - && (e820.map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820->nr_map; i++) { + if ((e820->map[i].type != E820_RAM) + && (e820->map[i].type != E820_RESERVED_KERN)) continue; - add_mac_region(e820.map[i].addr, e820.map[i].size); + add_mac_region(e820->map[i].addr, e820->map[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b70ca12dd389..bd4e3d4d3625 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +#ifdef CONFIG_VMAP_STACK +__visible void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) +{ + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", + (void *)fault_address, current->stack, + (char *)current->stack + THREAD_SIZE - 1); + die(message, regs, 0); + + /* Be absolutely certain we don't return. */ + panic(message); +} +#endif + #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long cr2; +#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; +#ifdef CONFIG_VMAP_STACK + /* + * If we overflow the stack into a guard page, the CPU will fail + * to deliver #PF and will send #DF instead. Similarly, if we + * take any non-IST exception while too close to the bottom of + * the stack, the processor will get a page fault while + * delivering the exception and will generate a double fault. + * + * According to the SDM (footnote in 6.15 under "Interrupt 14 - + * Page-Fault Exception (#PF)": + * + * Processors update CR2 whenever a page fault is detected.
If a + * second page fault occurs while an earlier page fault is being + * deliv- ered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a + * double fault. + * + * The logic below has a small possibility of incorrectly diagnosing + * some errors as stack overflows. For example, if the IDT or GDT + * gets corrupted such that #GP delivery fails due to a bad descriptor + * causing #GP and we hit this condition while CR2 coincidentally + * points to the stack guard page, we'll think we overflowed the + * stack. Given that we're going to panic one way or another + * if this happens, this isn't necessarily worth fixing. + * + * If necessary, we could improve the test by only diagnosing + * a stack overflow if the saved RSP points within 47 bytes of + * the bottom of the stack: if RSP == tsk_stack + 48 and we + * take an exception, the stack is already aligned and there + * will be enough room SS, RSP, RFLAGS, CS, RIP, and a + * possible error code, so a stack overflow would *not* double + * fault. With any less space left, exception delivery could + * fail, and, as a practical matter, we've overflowed the + * stack even if the actual trigger for the double fault was + * something else. + */ + cr2 = read_cr2(); + if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) + handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); +#endif + #ifdef CONFIG_DOUBLEFAULT df_debug(regs, error_code); #endif diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c new file mode 100644 index 000000000000..a2456d4d286a --- /dev/null +++ b/arch/x86/kernel/unwind_frame.c @@ -0,0 +1,93 @@ +#include <linux/sched.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +#define FRAME_HEADER_SIZE (sizeof(long) * 2) + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + unsigned long addr; + unsigned long *addr_p = unwind_get_return_address_ptr(state); + + if (unwind_done(state)) + return 0; + + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr_p); + + return __kernel_text_address(addr) ? addr : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool update_stack_state(struct unwind_state *state, void *addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If addr isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that 'info->next_sp' could point to an empty stack and 'addr' could + * be on a subsequent stack. 
+ */ + while (!on_stack(info, addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long *next_bp; + + if (unwind_done(state)) + return false; + + next_bp = (unsigned long *)*state->bp; + + /* make sure the next frame's data is accessible */ + if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) + return false; + + /* move to the next frame */ + state->bp = next_bp; + return true; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* don't even attempt to start from user mode regs */ + if (regs && user_mode(regs)) { + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; + } + + /* set up the starting stack frame */ + state->bp = get_frame_pointer(task, regs); + + /* initialize stack info and make sure the frame data is accessible */ + get_stack_info(state->bp, state->task, &state->stack_info, + &state->stack_mask); + update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->bp < first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c new file mode 100644 index 000000000000..b5a834c93065 --- /dev/null +++ b/arch/x86/kernel/unwind_guess.c @@ -0,0 +1,43 @@ +#include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + do { + for (state->sp++; state->sp < info->end; state->sp++) + if (__kernel_text_address(*state->sp)) + return true; + + state->sp = info->next_sp; + + } while (!get_stack_info(state->sp, state->task, info, + &state->stack_mask)); + + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + + state->task = task; + state->sp = first_frame; + + get_stack_info(first_frame, state->task, &state->stack_info, + &state->stack_mask); + + if (!__kernel_text_address(*first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 95e49f6e4fc3..b2cee3d19477 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -EXPORT_SYMBOL_GPL(memcpy_mcsafe); +EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 76c5e52436c4..0bd9f1287f39 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { 
}; static int default_i8042_detect(void) { return 1; }; -struct x86_platform_ops x86_platform = { +struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, @@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = { EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi = { +struct x86_msi_ops x86_msi __ro_after_init = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, @@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev) } #endif -struct x86_io_apic_ops x86_io_apic_ops = { +struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = { .read = native_io_apic_read, .disable = native_disable_io_apic, }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af523d84d102..1e6b84b96ea6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4961,7 +4961,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static struct kvm_x86_ops svm_x86_ops = { +static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5cede40e2552..121fdf6e9ed0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11177,7 +11177,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } -static struct kvm_x86_ops vmx_x86_ops = { +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 2ec0b0abbfaa..49e6ebac7e73 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -181,11 +181,11 @@ ENDPROC(memcpy_orig) #ifndef CONFIG_UML /* - * memcpy_mcsafe - memory copy with machine check exception handling + * memcpy_mcsafe_unrolled - memory copy with machine check exception handling * Note that we only catch machine checks when reading the source addresses. * Writes to target are posted and don't generate machine checks. */ -ENTRY(memcpy_mcsafe) +ENTRY(memcpy_mcsafe_unrolled) cmpl $8, %edx /* Less than 8 bytes? 
Go to byte copy loop */ jb .L_no_whole_words @@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe) .L_done_memcpy_trap: xorq %rax, %rax ret -ENDPROC(memcpy_mcsafe) +ENDPROC(memcpy_mcsafe_unrolled) .section .fixup, "ax" /* Return -EFAULT for any failure */ diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index ba47524f56e8..d1c7de095808 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -52,21 +52,6 @@ static __init int find_northbridge(void) return -ENOENT; } -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - int __init amd_numa_init(void) { u64 start = PFN_PHYS(0); @@ -180,8 +165,11 @@ int __init amd_numa_init(void) cores = 1 << bits; apicid_base = 0; - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); + /* + * get boot-time SMP configuration: + */ + early_get_smp_config(); + if (boot_cpu_physical_apicid > 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); apicid_base = boot_cpu_physical_apicid; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 832b98f822be..79ae939970d3 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/extable.h> #include <asm/uaccess.h> #include <asm/traps.h> #include <asm/kdebug.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc8023060456..1e525122cbe4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -5,7 +5,7 @@ */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ -#include <linux/module.h> /* search_exception_table */ +#include <linux/extable.h> /* search_exception_table */ #include <linux/bootmem.h> /* max_low_pfn */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ @@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; } +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + register void *__sp asm("rsp"); + unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. 
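Both this test and the CR2 test in do_double_fault() rely on the same unsigned-arithmetic idiom: a single comparison checks whether an address lies within PAGE_SIZE below the stack base, because an address on the wrong side wraps around to a huge unsigned value. Worked through with hypothetical numbers (THREAD_SIZE = 16 KB, PAGE_SIZE = 4 KB):

	/*
	 * Illustration only. Suppose tsk->stack = 0xffffc90000008000, so the
	 * stack occupies [0xffffc90000008000, 0xffffc9000000c000) and the
	 * guard page below it ends at 0xffffc90000007fff.
	 *
	 * Fault at 0xffffc90000007ff8, just below the stack:
	 *   stack - 1 - address = 0xffffc90000007fff - 0xffffc90000007ff8 = 7
	 *   and 7 < PAGE_SIZE, so this is diagnosed as an overflow.
	 *
	 * Fault at any address above the stack base: the subtraction wraps
	 * to a value near 2^64, so "< PAGE_SIZE" rejects it for free. The
	 * second comparison handles the guard page above the stack the
	 * same way, mirrored.
	 */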
+ */ + asm volatile ("movq %[stack], %%rsp\n\t" + "call handle_stack_overflow\n\t" + "1: jmp 1b" + : "+r" (__sp) + : "D" ("kernel stack overflow (page fault)"), + "S" (regs), "d" (address), + [stack] "rm" (stack)); + unreachable(); + } +#endif + /* * 32-bit: * diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d28a2d741f9e..22af912d66d2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -699,8 +699,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } -void free_initmem(void) +void __ref free_initmem(void) { + e820_reallocate_tables(); + free_init_pages("unused kernel", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index bda8d5eef04d..ddd2661c4502 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -40,17 +40,26 @@ * You need to add an if/def entry if you introduce a new memory region * compatible with KASLR. Your entry must be in logical order with memory * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to * ensure that this order is correct and won't be changed. */ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -static const unsigned long vaddr_end = VMEMMAP_START; + +#if defined(CONFIG_X86_ESPFIX64) +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; +#elif defined(CONFIG_EFI) +static const unsigned long vaddr_end = EFI_VA_START; +#else +static const unsigned long vaddr_end = __START_KERNEL_map; +#endif /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base = __VMALLOC_BASE; EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base = __VMEMMAP_BASE; +EXPORT_SYMBOL(vmemmap_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { &page_offset_base, 64/* Maximum */ }, { &vmalloc_base, VMALLOC_SIZE_TB }, + { &vmemmap_base, 1 }, }; /* Get size in bytes used by the memory region */ @@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void) struct rnd_state rand_state; unsigned long remain_entropy; + /* + * All these BUILD_BUG_ON checks ensures the memory layout is + * consistent with the vaddr_start/vaddr_end variables. + */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); + BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + vaddr_end >= EFI_VA_START); + BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || + config_enabled(CONFIG_EFI)) && + vaddr_end >= __START_KERNEL_map); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + if (!kaslr_memory_enabled()) return; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fb682108f4dc..3f35b48d1d9d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -722,22 +722,19 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static __init int find_near_online_node(int node) +static void __init init_memory_less_node(int nid) { - int n, val; - int min_val = INT_MAX; - int best_node = -1; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; - for_each_online_node(n) { - val = node_distance(node, n); + /* Allocate and initialize node data. 
Memory-less node is now online.*/ + alloc_node_data(nid); + free_area_init_node(nid, zones_size, 0, zholes_size); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - return best_node; + /* + * All zonelists will be built later in start_kernel() after per cpu + * areas are initialized. + */ } /* @@ -766,8 +763,10 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; + if (!node_online(node)) - node = find_near_online_node(node); + init_memory_less_node(node); + numa_set_node(cpu, node); } } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 849dc09fa4f0..e3353c97d086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa, } } -static int populate_pmd(struct cpa_data *cpa, - unsigned long start, unsigned long end, - unsigned num_pages, pud_t *pud, pgprot_t pgprot) +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) { - unsigned int cur_pages = 0; + long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; @@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa, return num_pages; } -static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) { pud_t *pud; unsigned long end; - int cur_pages = 0; + long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* Map trailing leftover */ if (start < end) { - int tmp; + long tmp; pud = pud_offset(pgd, start); if (pud_none(*pud)) @@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ pgd_t *pgd_entry; - int ret; + long ret; pgd_entry = cpa->pgd + pgd_index(addr); @@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { - int ret, numpages = cpa->numpages; + unsigned long numpages = cpa->numpages; + int ret; while (numpages) { /* diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index de391b7bc19a..159b52ccd600 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) struct memtype *rbt_memtype_lookup(u64 addr) { - struct memtype *data; - data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); - return data; + return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); } #if defined(CONFIG_DEBUG_FS) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4dbe65622810..a7655f6caf7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. 
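The pgd fixup above needs only the single top-level entry covering the current stack pointer, and pgd_index() is plain shift-and-mask arithmetic. A worked example (illustration only, 4-level paging where PGDIR_SHIFT = 39 and PTRS_PER_PGD = 512):

	/*
	 * For a hypothetical vmalloc-space stack pointer 0xffffc90000004000:
	 *
	 *   pgd_index = (0xffffc90000004000 >> 39) & 511
	 *             = 0x1fff92 & 511
	 *             = 402
	 *
	 * Entry 402 of the incoming mm's pgd must mirror init_mm.pgd[402]
	 * before the switch; otherwise the first touch of the stack after
	 * the CR3 write would double-fault, exactly as the comment above
	 * warns.
	 */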
+ */ + unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); + + pgd_t *pgd = next->pgd + stack_pgd_index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + } + #ifdef CONFIG_SMP this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.active_mm, next); #endif + cpumask_set_cpu(cpu, mm_cpumask(next)); /* diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index cb31a4440e58..a2488b6e27d6 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -16,27 +16,7 @@ #include <asm/ptrace.h> #include <asm/stacktrace.h> - -static int backtrace_stack(void *data, char *name) -{ - /* Yes, we want all stacks */ - return 0; -} - -static int backtrace_address(void *data, unsigned long addr, int reliable) -{ - unsigned int *depth = data; - - if ((*depth)--) - oprofile_add_trace(addr); - return 0; -} - -static struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; +#include <asm/unwind.h> #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * @@ -113,10 +93,29 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode(regs)) { - unsigned long stack = kernel_stack_pointer(regs); - if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, - &backtrace_ops, &depth); + struct unwind_state state; + unsigned long addr; + + if (!depth) + return; + + oprofile_add_trace(regs->ip); + + if (!--depth) + return; + + for (unwind_start(&state, current, regs, NULL); + !unwind_done(&state); unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + + oprofile_add_trace(addr); + + if (!--depth) + break; + } + return; } diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 9770e55e768f..1d97cea3b3a4 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service) static struct { unsigned long address; unsigned short segment; -} pci_indirect = { 0, __KERNEL_CS }; +} pci_indirect __ro_after_init = { + .address = 0, + .segment = __KERNEL_CS, +}; -static int pci_bios_present; +static int pci_bios_present __ro_after_init; static int __init check_pcibios(void) { diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 6a2f5691b1ab..6aad870e8962 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -82,21 +82,12 @@ void __init efi_bgrt_init(void) } bgrt_image_size = bmp_header.size; - bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); + bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); if (!bgrt_image) { - pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", - bgrt_image_size); - return; - } - - image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); - if (!image) { pr_notice("Ignoring BGRT: failed to map image memory\n"); - kfree(bgrt_image); bgrt_image = NULL; return; } - memcpy(bgrt_image, image, bgrt_image_size); - memunmap(image); + efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1fbb408e2e72..bf99aa7005eb 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -166,13 +166,15 @@ static void __init do_add_efi_memmap(void) } 
e820_add_region(start, size, e820_type); } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; + struct efi_memory_map_data data; phys_addr_t pmap; + int rv; if (efi_enabled(EFI_PARAVIRT)) return 0; @@ -187,11 +189,17 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - efi.memmap.phys_map = pmap; - efi.memmap.nr_map = e->efi_memmap_size / - e->efi_memdesc_size; - efi.memmap.desc_size = e->efi_memdesc_size; - efi.memmap.desc_version = e->efi_memdesc_version; + data.phys_map = pmap; + data.size = e->efi_memmap_size; + data.desc_size = e->efi_memdesc_size; + data.desc_version = e->efi_memdesc_version; + + rv = efi_memmap_init_early(&data); + if (rv) + return rv; + + if (add_efi_memmap) + do_add_efi_memmap(); WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", @@ -218,19 +226,6 @@ void __init efi_print_memmap(void) } } -void __init efi_unmap_memmap(void) -{ - unsigned long size; - - clear_bit(EFI_MEMMAP, &efi.flags); - - size = efi.memmap.nr_map * efi.memmap.desc_size; - if (efi.memmap.map) { - early_memunmap(efi.memmap.map, size); - efi.memmap.map = NULL; - } -} - static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { @@ -414,33 +409,6 @@ static int __init efi_runtime_init(void) return 0; } -static int __init efi_memmap_init(void) -{ - unsigned long addr, size; - - if (efi_enabled(EFI_PARAVIRT)) - return 0; - - /* Map the EFI memory map */ - size = efi.memmap.nr_map * efi.memmap.desc_size; - addr = (unsigned long)efi.memmap.phys_map; - - efi.memmap.map = early_memremap(addr, size); - if (efi.memmap.map == NULL) { - pr_err("Could not map the memory map!\n"); - return -ENOMEM; - } - - efi.memmap.map_end = efi.memmap.map + size; - - if (add_efi_memmap) - do_add_efi_memmap(); - - set_bit(EFI_MEMMAP, &efi.flags); - - return 0; -} - void __init efi_init(void) { efi_char16_t *c16; @@ -498,16 +466,14 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (efi_runtime_disabled() || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) { + efi_memmap_unmap(); return; + } } - if (efi_memmap_init()) - return; if (efi_enabled(EFI_DBG)) efi_print_memmap(); - - efi_esrt_init(); } void __init efi_late_init(void) @@ -624,42 +590,6 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) } } -static void __init save_runtime_map(void) -{ -#ifdef CONFIG_KEXEC_CORE - unsigned long desc_size; - efi_memory_desc_t *md; - void *tmp, *q = NULL; - int count = 0; - - if (efi_enabled(EFI_OLD_MEMMAP)) - return; - - desc_size = efi.memmap.desc_size; - - for_each_efi_memory_desc(md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME) || - (md->type == EFI_BOOT_SERVICES_CODE) || - (md->type == EFI_BOOT_SERVICES_DATA)) - continue; - tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); - if (!tmp) - goto out; - q = tmp; - - memcpy(q + count * desc_size, md, desc_size); - count++; - } - - efi_runtime_map_setup(q, count, desc_size); - return; - -out: - kfree(q); - pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n"); -#endif -} - static void *realloc_pages(void *old_memmap, int old_shift) { void *ret; @@ -745,6 +675,46 @@ static void *efi_map_next_entry(void *entry) return entry; } +static bool 
should_map_region(efi_memory_desc_t *md) +{ + /* + * Runtime regions always require runtime mappings (obviously). + */ + if (md->attribute & EFI_MEMORY_RUNTIME) + return true; + + /* + * 32-bit EFI doesn't suffer from the bug that requires us to + * reserve boot services regions, and mixed mode support + * doesn't exist for 32-bit kernels. + */ + if (IS_ENABLED(CONFIG_X86_32)) + return false; + + /* + * Map all of RAM so that we can access arguments in the 1:1 + * mapping when making EFI runtime calls. + */ + if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) { + if (md->type == EFI_CONVENTIONAL_MEMORY || + md->type == EFI_LOADER_DATA || + md->type == EFI_LOADER_CODE) + return true; + } + + /* + * Map boot services regions as a workaround for buggy + * firmware that accesses them even when they shouldn't. + * + * See efi_{reserve,free}_boot_services(). + */ + if (md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA) + return true; + + return false; +} + /* * Map the efi memory ranges of the runtime services and update new_mmap with * virtual addresses. @@ -761,13 +731,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) p = NULL; while ((p = efi_map_next_entry(p))) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) { -#ifdef CONFIG_X86_64 - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) -#endif - continue; - } + + if (!should_map_region(md)) + continue; efi_map_region(md); get_systab_virt_addr(md); @@ -803,7 +769,7 @@ static void __init kexec_enter_virtual_mode(void) * non-native EFI */ if (!efi_is_native()) { - efi_unmap_memmap(); + efi_memmap_unmap(); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -823,7 +789,18 @@ static void __init kexec_enter_virtual_mode(void) get_systab_virt_addr(md); } - save_runtime_map(); + /* + * Unregister the early EFI memmap from efi_init() and install + * the new EFI memory map. + */ + efi_memmap_unmap(); + + if (efi_memmap_init_late(efi.memmap.phys_map, + efi.memmap.desc_size * efi.memmap.nr_map)) { + pr_err("Failed to remap late EFI memory map\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } BUG_ON(!efi.systab); @@ -884,6 +861,7 @@ static void __init __efi_enter_virtual_mode(void) int count = 0, pg_shift = 0; void *new_memmap = NULL; efi_status_t status; + phys_addr_t pa; efi.systab = NULL; @@ -901,11 +879,24 @@ static void __init __efi_enter_virtual_mode(void) return; } - save_runtime_map(); + pa = __pa(new_memmap); + + /* + * Unregister the early EFI memmap from efi_init() and install + * the new EFI memory map that we are about to pass to the + * firmware via SetVirtualAddressMap(). 
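Stepping back to the helper added at the top of this hunk: should_map_region() centralizes what was previously scattered #ifdef logic in efi_map_regions(). Summarized as a decision table (a restatement of the code above, nothing new):

	/*
	 * should_map_region(md), in evaluation order:
	 *
	 *   md has EFI_MEMORY_RUNTIME                    -> map (always)
	 *   CONFIG_X86_32                                -> don't map anything else
	 *   mixed mode (CONFIG_EFI_MIXED, non-native):
	 *     CONVENTIONAL / LOADER_CODE / LOADER_DATA   -> map (1:1 argument access)
	 *   BOOT_SERVICES_CODE / BOOT_SERVICES_DATA      -> map (buggy-firmware workaround)
	 *   everything else                              -> don't map
	 */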
+ */ + efi_memmap_unmap(); + + if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) { + pr_err("Failed to remap late EFI memory map\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } BUG_ON(!efi.systab); - if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) { + if (efi_setup_page_tables(pa, 1 << pg_shift)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -917,14 +908,14 @@ static void __init __efi_enter_virtual_mode(void) efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + (efi_memory_desc_t *)pa); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + (efi_memory_desc_t *)pa); } if (status != EFI_SUCCESS) { @@ -956,15 +947,6 @@ static void __init __efi_enter_virtual_mode(void) efi_runtime_update_mappings(); efi_dump_pagetable(); - /* - * We mapped the descriptor array into the EFI pagetable above - * but we're not unmapping it here because if we're running in - * EFI mixed mode we need all of memory to be accessible when - * we pass parameters to the EFI runtime services in the - * thunking code. - */ - free_pages((unsigned long)new_memmap, pg_shift); - /* clean DUMMY object */ efi_delete_dummy_variable(); } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 677e29e29473..58b0f801f66f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -85,7 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void) early_code_mapping_set_exec(1); n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); - save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); + save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); for (pgd = 0; pgd < n_pgds; pgd++) { save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); @@ -214,7 +214,6 @@ void efi_sync_low_kernel_mappings(void) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long pfn, text; - efi_memory_desc_t *md; struct page *page; unsigned npages; pgd_t *pgd; @@ -245,28 +244,9 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * text and allocate a new stack because we can't rely on the * stack pointer being < 4GB. */ - if (!IS_ENABLED(CONFIG_EFI_MIXED)) + if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native()) return 0; - /* - * Map all of RAM so that we can access arguments in the 1:1 - * mapping when making EFI runtime calls. 
- */ - for_each_efi_memory_desc(md) { - if (md->type != EFI_CONVENTIONAL_MEMORY && - md->type != EFI_LOADER_DATA && - md->type != EFI_LOADER_CODE) - continue; - - pfn = md->phys_addr >> PAGE_SHIFT; - npages = md->num_pages; - - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) { - pr_err("Failed to map 1:1 memory\n"); - return 1; - } - } - page = alloc_page(GFP_KERNEL|__GFP_DMA32); if (!page) panic("Unable to allocate EFI runtime stack < 4GB\n"); @@ -359,6 +339,7 @@ void __init efi_map_region(efi_memory_desc_t *md) */ void __init efi_map_region_fixed(efi_memory_desc_t *md) { + __map_region(md, md->phys_addr); __map_region(md, md->virt_addr); } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 89d1146f5a6f..10aca63a50d7 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -164,6 +164,75 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, EXPORT_SYMBOL_GPL(efi_query_variable_store); /* + * The UEFI specification makes it clear that the operating system is + * free to do whatever it wants with boot services code after + * ExitBootServices() has been called. Ignoring this recommendation, a + * significant number of EFI implementations continue calling into boot + * services code (SetVirtualAddressMap). In order to work around such + * buggy implementations we reserve the boot services regions during EFI + * init and make sure they stay executable. Then, after + * SetVirtualAddressMap(), they are discarded. + * + * However, some boot services regions contain data that is required + * by drivers, so we need to track which memory ranges can never be + * freed. This is done by tagging those regions with the + * EFI_MEMORY_RUNTIME attribute. + * + * Any driver that wants to mark a region as reserved must use + * efi_mem_reserve() which will insert a new EFI memory descriptor + * into efi.memmap (splitting existing regions if necessary) and tag + * it with EFI_MEMORY_RUNTIME. + */ +void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) +{ + phys_addr_t new_phys, new_size; + struct efi_mem_range mr; + efi_memory_desc_t md; + int num_entries; + void *new; + + if (efi_mem_desc_lookup(addr, &md)) { + pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr); + return; + } + + if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) { + pr_err("Region spans EFI memory descriptors, %pa\n", &addr); + return; + } + + size += addr % EFI_PAGE_SIZE; + size = round_up(size, EFI_PAGE_SIZE); + addr = round_down(addr, EFI_PAGE_SIZE); + + mr.range.start = addr; + mr.range.end = addr + size - 1; + mr.attribute = md.attribute | EFI_MEMORY_RUNTIME; + + num_entries = efi_memmap_split_count(&md, &mr.range); + num_entries += efi.memmap.nr_map; + + new_size = efi.memmap.desc_size * num_entries; + + new_phys = memblock_alloc(new_size, 0); + if (!new_phys) { + pr_err("Could not allocate boot services memmap\n"); + return; + } + + new = early_memremap(new_phys, new_size); + if (!new) { + pr_err("Failed to map new boot services memmap\n"); + return; + } + + efi_memmap_insert(&efi.memmap, new, &mr); + early_memunmap(new, new_size); + + efi_memmap_install(new_phys, num_entries); +} + +/* * Helper function for efi_reserve_boot_services() to figure out if we * can free regions in efi_free_boot_services().
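The rounding in efi_arch_mem_reserve() above widens the caller's range to whole EFI pages so the split descriptors stay page-aligned. Worked through with hypothetical numbers (EFI_PAGE_SIZE is 4 KiB):

	/*
	 * Hypothetical call: efi_mem_reserve(0x12345678, 0x100).
	 *
	 *   size += 0x12345678 % 0x1000;             size = 0x778
	 *   size  = round_up(0x778, 0x1000);         size = 0x1000
	 *   addr  = round_down(0x12345678, 0x1000);  addr = 0x12345000
	 *
	 * The reservation becomes the whole page [0x12345000, 0x12346000),
	 * which covers the original [0x12345678, 0x12345778) range.
	 */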
* @@ -184,15 +253,6 @@ static bool can_free_region(u64 start, u64 size) return true; } -/* - * The UEFI specification makes it clear that the operating system is free to do - * whatever it wants with boot services code after ExitBootServices() has been - * called. Ignoring this recommendation a significant bunch of EFI implementations - * continue calling into boot services code (SetVirtualAddressMap). In order to - * work around such buggy implementations we reserve boot services region during - * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it -* is discarded. -*/ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; @@ -249,7 +309,10 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { + phys_addr_t new_phys, new_size; efi_memory_desc_t *md; + int num_entries = 0; + void *new, *new_md; for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; @@ -257,12 +320,16 @@ void __init efi_free_boot_services(void) size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) + md->type != EFI_BOOT_SERVICES_DATA) { + num_entries++; continue; + } /* Do not free, someone else owns it: */ - if (md->attribute & EFI_MEMORY_RUNTIME) + if (md->attribute & EFI_MEMORY_RUNTIME) { + num_entries++; continue; + } /* * Nasty quirk: if all sub-1MB memory is used for boot @@ -287,7 +354,41 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } - efi_unmap_memmap(); + new_size = efi.memmap.desc_size * num_entries; + new_phys = memblock_alloc(new_size, 0); + if (!new_phys) { + pr_err("Failed to allocate new EFI memmap\n"); + return; + } + + new = memremap(new_phys, new_size, MEMREMAP_WB); + if (!new) { + pr_err("Failed to map new EFI memmap\n"); + return; + } + + /* + * Build a new EFI memmap that excludes any boot services + * regions that are not tagged EFI_MEMORY_RUNTIME, since those + * regions have now been freed. + */ + new_md = new; + for_each_efi_memory_desc(md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + (md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA)) + continue; + + memcpy(new_md, md, efi.memmap.desc_size); + new_md += efi.memmap.desc_size; + } + + memunmap(new); + + if (efi_memmap_install(new_phys, num_entries)) { + pr_err("Could not install new EFI memmap\n"); + return; + } } /* @@ -365,7 +466,7 @@ void __init efi_apply_memmap_quirks(void) */ if (!efi_runtime_supported()) { pr_info("Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); + efi_memmap_unmap(); } /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. 
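For reference, the efi_memmap_split_count()/efi_memmap_insert() pair used by efi_arch_mem_reserve() earlier in this file carves the reserved range out of its descriptor. An illustration of the intended semantics (the helpers themselves live in the generic EFI code, outside this diff); a range strictly inside a descriptor splits it into three:

	/*
	 * Before:   [ 0x10000 .. 0x1ffff, BOOT_SERVICES_DATA ]
	 * Reserve:           [ 0x14000 .. 0x14fff ]
	 * After:    [ 0x10000 .. 0x13fff ]
	 *           [ 0x14000 .. 0x14fff, + EFI_MEMORY_RUNTIME ]
	 *           [ 0x15000 .. 0x1ffff ]
	 *
	 * num_entries grows by two. A range touching either end of the
	 * descriptor splits it into two pieces instead, and one covering
	 * the whole descriptor simply retags it in place.
	 */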
*/ diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 23f2f3e41c7f..b4d5e95fe4df 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -149,11 +149,8 @@ EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); s64 uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) { - s64 ret; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, - (u64)addr, buf, (u64)len, 0); - return ret; + return uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); } EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 56c5a3a3884a..9e42842e924a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -603,11 +603,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, */ static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc) { - unsigned long descriptor_status; - - descriptor_status = - ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; - return descriptor_status; + return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; } /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b12c26e2e309..53cace2ec0e2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -130,7 +130,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); - ctxt->cr4 = __read_cr4_safe(); + ctxt->cr4 = __read_cr4(); #ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); #endif diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 1104515d5ad2..1ac76479c266 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); /* * Caller needs to make sure this cpu doesn't disappear @@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid) static void prepare_msrs(void *info) { - struct mce i_mce = *(struct mce *)info; - u8 b = i_mce.bank; + struct mce m = *(struct mce *)info; + u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus); + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (i_mce.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr); + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr); + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); } else { -
wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc); + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); } - } static void do_inject(void) @@ -275,6 +278,9 @@ static void do_inject(void) if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + if (inj_type == SW_INJ) { mce_inject_log(&i_mce); return; @@ -301,7 +307,9 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); cpu = get_nbc_for_node(amd_get_nb_id(cpu)); } @@ -371,6 +379,9 @@ static const char readme_msg[] = "\t used for error thresholding purposes and its validity is indicated by\n" "\t MCi_STATUS[MiscV].\n" "\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" "addr:\t Error address value to be written to MCi_ADDR. Log address information\n" "\t associated with the error.\n" "\n" @@ -420,6 +431,7 @@ static struct dfs_node { { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, @@ -428,7 +440,7 @@ static struct dfs_node { static int __init init_mce_inject(void) { - int i; + unsigned int i; u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); @@ -452,26 +464,22 @@ static int __init init_mce_inject(void) return 0; err_dfs_add: - while (--i >= 0) + while (i-- > 0) debugfs_remove(dfs_fls[i].d); debugfs_remove(dfs_inj); dfs_inj = NULL; - return -ENOMEM; + return -ENODEV; } static void __exit exit_mce_inject(void) { - int i; - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) - debugfs_remove(dfs_fls[i].d); + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; memset(&dfs_fls, 0, sizeof(dfs_fls)); - - debugfs_remove(dfs_inj); - dfs_inj = NULL; } module_init(init_mce_inject); module_exit(exit_mce_inject); diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a7ef7b131e25..5766ead6fdb9 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -194,7 +194,7 @@ int peek_user(struct task_struct *child, long addr, long data) static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_cpu(child); struct user_i387_struct fpregs; err = save_i387_registers(userspace_pid[cpu], @@ -211,7 +211,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_cpu(child); struct user_i387_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); @@ -224,7 +224,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static 
int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_cpu(child); struct user_fxsr_struct fpregs; err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); @@ -240,7 +240,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct * static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_cpu(child); struct user_fxsr_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b86ebb1a9a7f..f1d2182e071f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1237,7 +1237,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, #ifdef CONFIG_X86_64 @@ -1925,6 +1924,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c) } } +static void xen_pin_vcpu(int cpu) +{ + static bool disable_pinning; + struct sched_pin_override pin_override; + int ret; + + if (disable_pinning) + return; + + pin_override.pcpu = cpu; + ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override); + + /* Ignore errors when removing override. */ + if (cpu < 0) + return; + + switch (ret) { + case -ENOSYS: + pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n", + cpu); + disable_pinning = true; + break; + case -EPERM: + WARN(1, "Trying to pin vcpu without having privilege to do so\n"); + disable_pinning = true; + break; + case -EINVAL: + case -EBUSY: + pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n", + cpu); + break; + case 0: + break; + default: + WARN(1, "rc %d while trying to pin vcpu\n", ret); + disable_pinning = true; + } +} + const struct hypervisor_x86 x86_hyper_xen = { .name = "Xen", .detect = xen_platform, @@ -1933,6 +1971,7 @@ const struct hypervisor_x86 x86_hyper_xen = { #endif .x2apic_available = xen_x2apic_para_available, .set_cpu_features = xen_set_cpu_features, + .pin_vcpu = xen_pin_vcpu, }; EXPORT_SYMBOL(x86_hyper_xen); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 176425233e4d..f8960fca0827 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -861,7 +861,7 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); /* * Check whether the kernel itself conflicts with the target E820 map. 
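The new .pin_vcpu hook in enlighten.c above gives core code a uniform way to ask the hypervisor to bind the current vCPU to a physical CPU; per the switch statement, a negative argument removes the override and persistent errors self-disable the feature. A hedged sketch of a caller (run_on_pcpu() is hypothetical, and it assumes callers reach the hook through the existing x86_hyper pointer):

	/* Hypothetical caller: run a firmware call on a specific physical CPU. */
	static void run_on_pcpu(int pcpu)
	{
		if (x86_hyper && x86_hyper->pin_vcpu)
			x86_hyper->pin_vcpu(pcpu);	/* request pinning */

		/* ... issue the platform call that must execute on pcpu ... */

		if (x86_hyper && x86_hyper->pin_vcpu)
			x86_hyper->pin_vcpu(-1);	/* drop the override */
	}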
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index f42e78de1e10..3d6e0064cbfc 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -21,8 +21,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); static bool xen_pvspin = true; -#ifdef CONFIG_QUEUED_SPINLOCKS - #include <asm/qspinlock.h> static void xen_qlock_kick(int cpu) @@ -71,207 +69,6 @@ static void xen_qlock_wait(u8 *byte, u8 val) xen_poll_irq(irq); } -#else /* CONFIG_QUEUED_SPINLOCKS */ - -enum xen_contention_stat { - TAKEN_SLOW, - TAKEN_SLOW_PICKUP, - TAKEN_SLOW_SPURIOUS, - RELEASED_SLOW, - RELEASED_SLOW_KICKED, - NR_CONTENTION_STATS -}; - - -#ifdef CONFIG_XEN_DEBUG_FS -#define HISTO_BUCKETS 30 -static struct xen_spinlock_stats -{ - u32 contention_stats[NR_CONTENTION_STATS]; - u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_blocked; -} spinlock_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - u8 ret; - u8 old = READ_ONCE(zero_stats); - if (unlikely(old)) { - ret = cmpxchg(&zero_stats, old, 0); - /* This ensures only one fellow resets the stat */ - if (ret == old) - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - } -} - -static inline void add_stats(enum xen_contention_stat var, u32 val) -{ - check_zero(); - spinlock_stats.contention_stats[var] += val; -} - -static inline u64 spin_time_start(void) -{ - return xen_clocksource_read(); -} - -static void __spin_time_accum(u64 delta, u32 *array) -{ - unsigned index = ilog2(delta); - - check_zero(); - - if (index < HISTO_BUCKETS) - array[index]++; - else - array[HISTO_BUCKETS]++; -} - -static inline void spin_time_accum_blocked(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); - spinlock_stats.time_blocked += delta; -} -#else /* !CONFIG_XEN_DEBUG_FS */ -static inline void add_stats(enum xen_contention_stat var, u32 val) -{ -} - -static inline u64 spin_time_start(void) -{ - return 0; -} - -static inline void spin_time_accum_blocked(u64 start) -{ -} -#endif /* CONFIG_XEN_DEBUG_FS */ - -struct xen_lock_waiting { - struct arch_spinlock *lock; - __ticket_t want; -}; - -static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); -static cpumask_t waiting_cpus; - -__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) -{ - int irq = __this_cpu_read(lock_kicker_irq); - struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting); - int cpu = smp_processor_id(); - u64 start; - __ticket_t head; - unsigned long flags; - - /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) - return; - - start = spin_time_start(); - - /* - * Make sure an interrupt handler can't upset things in a - * partially setup state. - */ - local_irq_save(flags); - /* - * We don't really care if we're overwriting some other - * (lock,want) pair, as that would mean that we're currently - * in an interrupt context, and the outer context had - * interrupts enabled. That has already kicked the VCPU out - * of xen_poll_irq(), so it will just return spuriously and - * retry with newly setup (lock,want). - * - * The ordering protocol on this is that the "lock" pointer - * may only be set non-NULL if the "want" ticket is correct. - * If we're updating "want", we must first clear "lock". 
- */ - w->lock = NULL; - smp_wmb(); - w->want = want; - smp_wmb(); - w->lock = lock; - - /* This uses set_bit, which atomic and therefore a barrier */ - cpumask_set_cpu(cpu, &waiting_cpus); - add_stats(TAKEN_SLOW, 1); - - /* clear pending */ - xen_clear_irq_pending(irq); - - /* Only check lock once pending cleared */ - barrier(); - - /* - * Mark entry to slowpath before doing the pickup test to make - * sure we don't deadlock with an unlocker. - */ - __ticket_enter_slowpath(lock); - - /* make sure enter_slowpath, which is atomic does not cross the read */ - smp_mb__after_atomic(); - - /* - * check again make sure it didn't become free while - * we weren't looking - */ - head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(head, want)) { - add_stats(TAKEN_SLOW_PICKUP, 1); - goto out; - } - - /* Allow interrupts while blocked */ - local_irq_restore(flags); - - /* - * If an interrupt happens here, it will leave the wakeup irq - * pending, which will cause xen_poll_irq() to return - * immediately. - */ - - /* Block until irq becomes pending (or perhaps a spurious wakeup) */ - xen_poll_irq(irq); - add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - - local_irq_save(flags); - - kstat_incr_irq_this_cpu(irq); -out: - cpumask_clear_cpu(cpu, &waiting_cpus); - w->lock = NULL; - - local_irq_restore(flags); - - spin_time_accum_blocked(start); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); - -static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) -{ - int cpu; - - add_stats(RELEASED_SLOW, 1); - - for_each_cpu(cpu, &waiting_cpus) { - const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); - - /* Make sure we read lock before want */ - if (READ_ONCE(w->lock) == lock && - READ_ONCE(w->want) == next) { - add_stats(RELEASED_SLOW_KICKED, 1); - xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; - } - } -} -#endif /* CONFIG_QUEUED_SPINLOCKS */ - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); @@ -334,16 +131,12 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); -#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = xen_qlock_wait; pv_lock_ops.kick = xen_qlock_kick; -#else - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); - pv_lock_ops.unlock_kick = xen_unlock_kick; -#endif } /* @@ -372,44 +165,3 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) - -static struct dentry *d_spin_debug; - -static int __init xen_spinlock_debugfs(void) -{ - struct dentry *d_xen = xen_init_debugfs(); - - if (d_xen == NULL) - return -ENOMEM; - - if (!xen_pvspin) - return 0; - - d_spin_debug = debugfs_create_dir("spinlocks", d_xen); - - debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - - debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW]); - debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); - debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - - debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW]); - debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - 
&spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - - debugfs_create_u64("time_blocked", 0444, d_spin_debug, - &spinlock_stats.time_blocked); - - debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); - - return 0; -} -fs_initcall(xen_spinlock_debugfs); - -#endif /* CONFIG_XEN_DEBUG_FS */
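With the ticket-lock slowpath gone, Xen's only paravirt locking surface is the queued-spinlock pair kept at the top of this file: wait() parks a vCPU until the lock byte changes, kick() wakes the waiter. A condensed sketch of the protocol (an illustration of the contract, not the kernel implementation):

	/*
	 * waiter, xen_qlock_wait(byte, val):     unlocker, xen_qlock_kick(cpu):
	 *   1. clear the pending lock event        1. store the new lock byte
	 *   2. re-read *byte; if it no longer      2. send the per-cpu lock
	 *      equals val, return                     event to the waiting cpu
	 *   3. otherwise poll the event channel
	 *
	 * Step 2 on the waiter side closes the lost-wakeup window: if the
	 * kick lands between the re-read and the poll, the event is already
	 * pending and xen_poll_irq() returns immediately.
	 */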