diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-07 01:56:41 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-07 01:56:41 +0300 |
commit | 8f147727030bf9e81331ab9b8f42d4611bb6a3d9 (patch) | |
tree | d3f1e2410174bb8c479590a8f1c7e204e3a48eaf | |
parent | 53f8b081c184328b82c8a7b5e70b8243b3cea8bd (diff) | |
parent | 2c4645439e8f2f6e7c37f158feae6f6a82baa910 (diff) | |
download | linux-8f147727030bf9e81331ab9b8f42d4611bb6a3d9.tar.xz |
Merge branch 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 irq updates from Ingo Molnar:
"Here are the main changes in this tree:
- Introduce x86-64 IRQ/exception/debug stack guard pages to detect
stack overflows immediately and deterministically.
- Clean up over a decade worth of cruft accumulated.
The outcome of this should be more clear-cut faults/crashes when any
of the low level x86 CPU stacks overflow, instead of silent memory
corruption and sporadic failures much later on"
* 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
x86/irq: Fix outdated comments
x86/irq/64: Remove stack overflow debug code
x86/irq/64: Remap the IRQ stack with guard pages
x86/irq/64: Split the IRQ stack into its own pages
x86/irq/64: Init hardirq_stack_ptr during CPU hotplug
x86/irq/32: Handle irq stack allocation failure proper
x86/irq/32: Invoke irq_ctx_init() from init_IRQ()
x86/irq/64: Rename irq_stack_ptr to hardirq_stack_ptr
x86/irq/32: Rename hard/softirq_stack to hard/softirq_stack_ptr
x86/irq/32: Make irq stack a character array
x86/irq/32: Define IRQ_STACK_SIZE
x86/dumpstack/64: Speedup in_exception_stack()
x86/exceptions: Split debug IST stack
x86/exceptions: Enable IST guard pages
x86/exceptions: Disconnect IST index and stack order
x86/cpu: Remove orig_ist array
x86/cpu: Prepare TSS.IST setup for guard pages
x86/dumpstack/64: Use cpu_entry_area instead of orig_ist
x86/irq/64: Use cpu entry area instead of orig_ist
x86/traps: Use cpu_entry_area instead of orig_ist
...
33 files changed, 377 insertions, 317 deletions
diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks index 9a0aa4d3a866..d1bfb0b95ee0 100644 --- a/Documentation/x86/kernel-stacks +++ b/Documentation/x86/kernel-stacks @@ -59,7 +59,7 @@ If that assumption is ever broken then the stacks will become corrupt. The currently assigned IST stacks are :- -* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_DF. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 8 - Double Fault Exception (#DF). @@ -68,7 +68,7 @@ The currently assigned IST stacks are :- Using a separate stack allows the kernel to recover from it well enough in many cases to still output an oops. -* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_NMI. EXCEPTION_STKSZ (PAGE_SIZE). Used for non-maskable interrupts (NMI). @@ -76,7 +76,7 @@ The currently assigned IST stacks are :- middle of switching stacks. Using IST for NMI events avoids making assumptions about the previous state of the kernel stack. -* DEBUG_STACK. DEBUG_STKSZ +* ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE). Used for hardware debug interrupts (interrupt 1) and for software debug interrupts (INT3). @@ -86,7 +86,12 @@ The currently assigned IST stacks are :- avoids making assumptions about the previous state of the kernel stack. -* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + To handle nested #DB correctly there exist two instances of DB stacks. On + #DB entry the IST stackpointer for #DB is switched to the second instance + so a nested #DB starts from a clean stack. The nested #DB switches + the IST stackpointer to a guard hole to catch triple nesting. + +* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 18 - Machine Check Exception (#MC). diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7e59efc70b91..db95da6d644d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,6 +14,7 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select HAVE_DEBUG_STACKOVERFLOW select MODULES_USE_ELF_REL select OLD_SIGACTION @@ -138,7 +139,6 @@ config X86 select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK - select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e7e270603fe7..20e45d9b4e15 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif #ifdef CONFIG_RETPOLINE @@ -430,8 +430,8 @@ END(irq_entries_start) * it before we actually move ourselves to the IRQ stack. */ - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(irq_stack_ptr), %rsp + movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY /* @@ -840,7 +840,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) /** * idtentry - Generate an IDT entry stub @@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -924,13 +924,13 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ @@ -1128,7 +1128,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 29c706415443..cff3f3f3bfe0 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -7,6 +7,64 @@ #include <asm/processor.h> #include <asm/intel_ds.h> +#ifdef CONFIG_X86_64 + +/* Macro to enforce the same ordering and stack sizes */ +#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB2_stack_guard[guardsize]; \ + char DB2_stack[db2_holesize]; \ + char DB1_stack_guard[guardsize]; \ + char DB1_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[EXCEPTION_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char IST_top_guard[guardsize]; \ + +/* The exception stacks' physical storage. No guard pages required */ +struct exception_stacks { + ESTACKS_MEMBERS(0, 0) +}; + +/* The effective cpu entry area mapping with guard pages. */ +struct cea_exception_stacks { + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) +}; + +/* + * The exception stack ordering in [cea_]exception_stacks + */ +enum exception_stack_ordering { + ESTACK_DF, + ESTACK_NMI, + ESTACK_DB2, + ESTACK_DB1, + ESTACK_DB, + ESTACK_MCE, + N_EXCEPTION_STACKS +}; + +#define CEA_ESTACK_SIZE(st) \ + sizeof(((struct cea_exception_stacks *)0)->st## _stack) + +#define CEA_ESTACK_BOT(ceastp, st) \ + ((unsigned long)&(ceastp)->st## _stack) + +#define CEA_ESTACK_TOP(ceastp, st) \ + (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st)) + +#define CEA_ESTACK_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack) + +#define CEA_ESTACK_PAGES \ + (sizeof(struct cea_exception_stacks) / PAGE_SIZE) + +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -32,12 +90,9 @@ struct cpu_entry_area { #ifdef CONFIG_X86_64 /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. + * Exception stacks used for IST entries with guard pages. */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + struct cea_exception_stacks estacks; #endif #ifdef CONFIG_CPU_SUP_INTEL /* @@ -57,6 +112,7 @@ struct cpu_entry_area { #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); @@ -76,4 +132,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } +#define __this_cpu_ist_top_va(name) \ + CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9e5ca30738e5..1a8609a15856 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void) { __this_cpu_dec(debug_stack_usage); } -int is_debug_stack(unsigned long addr); void debug_stack_set_zero(void); void debug_stack_reset(void); #else /* !X86_64 */ -static inline int is_debug_stack(unsigned long addr) { return 0; } static inline void debug_stack_set_zero(void) { } static inline void debug_stack_reset(void) { } static inline void debug_stack_usage_inc(void) { } diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index fbb16e6b6c18..8f95686ec27e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_32 -extern void irq_ctx_init(int cpu); -#else -# define irq_ctx_init(cpu) do { } while (0) -#endif +extern int irq_init_percpu_irqstack(unsigned int cpu); #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 548d90bbf919..889f8b1b5b7f 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -18,8 +18,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts - * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts + * Vectors 129 ... LOCAL_TIMER_VECTOR-1 + * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0d5c739eebd7..565ad755c785 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -22,11 +22,9 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 0 -#define DEBUG_STACK 0 -#define MCE_STACK 0 -#define N_EXCEPTION_STACKS 1 +#define IRQ_STACK_SIZE THREAD_SIZE + +#define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE /* diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8f657286d599..793c14c372cb 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,22 +14,20 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) -#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) - #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 2 -#define DEBUG_STACK 3 -#define MCE_STACK 4 -#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ +/* + * The index for the tss.ist[] array. The hardware limit is 7 entries. + */ +#define IST_INDEX_DF 0 +#define IST_INDEX_NMI 1 +#define IST_INDEX_DB 2 +#define IST_INDEX_MCE 3 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2bb3a648fc12..7e99ef67bff0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); #define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) +/* Per CPU interrupt stacks */ +struct irq_stack { + char stack[IRQ_STACK_SIZE]; +} __aligned(IRQ_STACK_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -374,38 +381,25 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); - -union irq_stack_union { - char irq_stack[IRQ_STACK_SIZE]; +struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. */ - struct { - char gs_base[40]; - unsigned long stack_canary; - }; + char gs_base[40]; + unsigned long stack_canary; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -DECLARE_INIT_PER_CPU(irq_stack_union); +DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; +DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); @@ -427,15 +421,8 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* - * per-CPU IRQ handling stacks - */ -struct irq_stack { - u32 stack[THREAD_SIZE/sizeof(u32)]; -} __aligned(THREAD_SIZE); - -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack); +/* Per CPU softirq stack pointer */ +DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ extern unsigned int fpu_kernel_xstate_size; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 2e95b6c1bca3..da545df207b2 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -131,7 +131,7 @@ void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); -void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); +int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 8ec97a62c245..91e29b6a86a5 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -13,7 +13,7 @@ * On x86_64, %gs is shared by percpu area and stack canary. All * percpu symbols are zero based and %gs points to the base of percpu * area. The first occupant of the percpu area is always - * irq_stack_union which contains stack_canary at offset 40. Userland + * fixed_percpu_data which contains stack_canary at offset 40. Userland * %gs is always saved and restored on kernel entry and exit using * swapgs, so stack protector doesn't add any complexity there. * @@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void) u64 tsc; #ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); + BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); #endif /* * We both use the random pool and the current TSC as a source @@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - this_cpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(fixed_percpu_data.stack_canary, canary); #else this_cpu_write(stack_canary.canary, canary); #endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index beef7ad9e43a..a8d0cdf48616 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -9,6 +9,8 @@ #include <linux/uaccess.h> #include <linux/ptrace.h> + +#include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b5..d3d075226c0a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,10 +68,12 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 95a5faf3a6a0..37f7d438a6ef 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,19 +507,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1511,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. Align current_task to @@ -1523,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; @@ -1562,23 +1547,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1690,17 +1659,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1681,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1718,11 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == DEBUG_STACK-1) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..64a59d726639 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..753b8cfe8b8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,23 +16,21 @@ #include <linux/bug.h> #include <linux/nmi.h> +#include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", -}; - -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type) return NULL; } +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB1), + EPAGERANGE(DB), + EPAGERANGE(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; struct pt_regs *regs; - unsigned k; + unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); - regs = (struct pt_regs *)end - 1; - - if (stack <= begin || stack >= end) - continue; + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; - info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; - info->next_sp = (unsigned long *)regs->sp; + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? */ + if (!ep->size) + return false; - return true; - } + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; - return false; + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; } static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb82..bcd206c8ac90 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,13 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) - -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 95600a99ae93..fc34816c6f04 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); static void call_on_stack(void *func, void *stack) { @@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) } /* - * allocate per-cpu stacks for hardirq and for softirq processing + * Allocate per-cpu stacks for hardirq and softirq processing */ -void irq_ctx_init(int cpu) +int irq_init_percpu_irqstack(unsigned int cpu) { - struct irq_stack *irqstk; - - if (per_cpu(hardirq_stack, cpu)) - return; + int node = cpu_to_node(cpu); + struct page *ph, *ps; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack, cpu) = irqstk; + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(softirq_stack, cpu) = irqstk; + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } - printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; } void do_softirq_own_stack(void) @@ -135,7 +136,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..6bf6517a05bb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,63 +18,64 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> + +#include <asm/cpu_entry_area.h> #include <asm/io_apic.h> #include <asm/apic.h> -int sysctl_panic_on_stackoverflow; +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); -/* - * Probabilistic stack overflow check: - * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. - */ -static inline void stack_overflow_check(struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { -#ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; - u64 curbase = (u64)task_stack_page(current); + if (IS_ERR_OR_NULL(desc)) + return false; - if (user_mode(regs)) - return; + generic_handle_irq_desc(desc); + return true; +} - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && - regs->sp <= curbase + THREAD_SIZE) - return; +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) - return; + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) - return; + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); -#endif + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; } - -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ +static int map_irq_stack(unsigned int cpu) { - stack_overflow_check(regs); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - if (IS_ERR_OR_NULL(desc)) - return false; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#endif - generic_handle_irq_desc(desc); - return true; +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a0693b71cfc1..16919a9671fa 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -91,6 +91,8 @@ void __init init_IRQ(void) for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + x86_init.irqs.intr_init(); } @@ -104,6 +106,4 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); - - irq_ctx_init(smp_processor_id()); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..3755d0310026 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -21,13 +21,14 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/atomic.h> #include <linux/sched/clock.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <linux/atomic.h> +#include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> #include <asm/nmi.h> @@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); * switch back to the original IDT. */ static DEFINE_PER_CPU(int, update_debug_stack); + +static bool notrace is_debug_stack(unsigned long addr) +{ + struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); + unsigned long top = CEA_ESTACK_TOP(cs, DB); + unsigned long bot = CEA_ESTACK_BOT(cs, DB1); + + if (__this_cpu_read(debug_stack_usage)) + return true; + /* + * Note, this covers the guard page between DB and DB1 as well to + * avoid two checks. But by all means @addr can never point into + * the guard page. + */ + return addr >= bot && addr < top; +} +NOKPROBE_SYMBOL(is_debug_stack); #endif dotraplinkage notrace void diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4bf46575568a..86663874ef04 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif -#ifdef CONFIG_X86_64 - per_cpu(irq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE; -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce1a67b70168..c92b21f9e9dc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -935,20 +935,27 @@ out: return boot_error; } -void common_cpu_up(unsigned int cpu, struct task_struct *idle) +int common_cpu_up(unsigned int cpu, struct task_struct *idle) { + int ret; + /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif + return 0; } /* @@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; - common_cpu_up(cpu, tidle); + err = common_cpu_up(cpu, tidle); + if (err) + return err; err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a5127b2c195f..4d1517022a14 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -403,7 +403,8 @@ SECTIONS */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); /* * Build-time check on the image size: @@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 19c6abf9ea31..752ad11d6868 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -13,8 +13,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); +DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } -static void __init percpu_setup_debug_store(int cpu) +static void __init percpu_setup_debug_store(unsigned int cpu) { #ifdef CONFIG_CPU_SUP_INTEL - int npages; + unsigned int npages; void *cea; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) @@ -78,9 +78,43 @@ static void __init percpu_setup_debug_store(int cpu) #endif } +#ifdef CONFIG_X86_64 + +#define cea_map_stack(name) do { \ + npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \ + cea_map_percpu_pages(cea->estacks.name## _stack, \ + estacks->name## _stack, npages, PAGE_KERNEL); \ + } while (0) + +static void __init percpu_setup_exception_stacks(unsigned int cpu) +{ + struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + unsigned int npages; + + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + + per_cpu(cea_exception_stacks, cpu) = &cea->estacks; + + /* + * The exceptions stack mappings in the per cpu area are protected + * by guard pages so each stack must be mapped separately. DB2 is + * not mapped; it just exists to catch triple nesting of #DB. + */ + cea_map_stack(DF); + cea_map_stack(NMI); + cea_map_stack(DB1); + cea_map_stack(DB); + cea_map_stack(MCE); +} +#else +static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +#endif + /* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(unsigned int cpu) { + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; @@ -101,10 +135,9 @@ static void __init setup_cpu_entry_area(int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif - cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), - gdt_prot); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + cea_map_percpu_pages(&cea->entry_stack_page, per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); @@ -128,22 +161,15 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, - &per_cpu(cpu_tss_rw, cpu), + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + per_cpu(cpu_entry_area, cpu) = cea; #endif -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, - &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); -#endif + percpu_setup_exception_stacks(cpu); + percpu_setup_debug_store(cpu); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 667f1da36208..06c089513d39 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,6 +28,7 @@ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ +#include <asm/cpu_entry_area.h> /* exception stack */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -793,7 +794,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index f345586f5e50..ce7188cbdae5 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -754,7 +754,7 @@ static void percpu_init(void) * __per_cpu_load * * The "gold" linker incorrectly associates: - * init_per_cpu__irq_stack_union + * init_per_cpu__fixed_percpu_data * init_per_cpu__gdt_page */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 145506f9fdbe..590fcf863006 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -361,7 +361,9 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; - common_cpu_up(cpu, idle); + rc = common_cpu_up(cpu, idle); + if (rc) + return rc; xen_setup_runstate_info(cpu); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 5077ead5e59c..c1d8b90aa4e2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -40,13 +40,13 @@ ENTRY(startup_xen) #ifdef CONFIG_X86_64 /* Set up %gs. * - * The base of %gs always points to the bottom of the irqstack - * union. If the stack protector canary is enabled, it is - * located at %gs:40. Note that, on SMP, the boot cpu uses - * init data section till per cpu areas are set up. + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax cdq wrmsr #endif diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 117e76b2f939..084e45882c73 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1687,7 +1687,6 @@ void __init xen_init_IRQ(void) #ifdef CONFIG_X86 if (xen_pv_domain()) { - irq_ctx_init(smp_processor_id()); if (xen_initial_domain()) pci_xen_initial_domain(); } diff --git a/mm/slab.c b/mm/slab.c index 9142ee992493..284ab737faee 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1467,53 +1467,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) } #ifdef CONFIG_DEBUG_PAGEALLOC -static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, - unsigned long caller) -{ - int size = cachep->object_size; - - addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; - - if (size < 5 * sizeof(unsigned long)) - return; - - *addr++ = 0x12345678; - *addr++ = caller; - *addr++ = smp_processor_id(); - size -= 3 * sizeof(unsigned long); - { - unsigned long *sptr = &caller; - unsigned long svalue; - - while (!kstack_end(sptr)) { - svalue = *sptr++; - if (kernel_text_address(svalue)) { - *addr++ = svalue; - size -= sizeof(unsigned long); - if (size <= sizeof(unsigned long)) - break; - } - } - - } - *addr++ = 0x87654321; -} - -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) { if (!is_debug_pagealloc_cache(cachep)) return; - if (caller) - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) {} + int map) {} #endif @@ -1661,7 +1625,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -2433,7 +2397,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, 0); + slab_kernel_map(cachep, objp, 0); } } #endif @@ -2812,7 +2776,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, caller); + slab_kernel_map(cachep, objp, 0); } return objp; } @@ -3076,7 +3040,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) |