summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/switch_to.h28
-rw-r--r--arch/x86/kernel/traps.c61
-rw-r--r--arch/x86/mm/tlb.c15
4 files changed, 104 insertions, 1 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c580d8c33562..21a6d0ec5983 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,6 +94,7 @@ config X86
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_EBPF_JIT if X86_64
+ select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8f321a1b03a1..14e4b20f0aaf 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,6 +8,28 @@ struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
+/* This runs runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+ struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we switch to a stack that has a top-level paging entry
+ * that is not present in the current mm, the resulting #PF will
+ * will be promoted to a double-fault and we'll panic. Probe
+ * the new stack now so that vmalloc_fault can fix up the page
+ * tables if needed. This can only happen if we use a stack
+ * in vmap space.
+ *
+ * We assume that the stack is aligned so that it never spans
+ * more than one top-level paging entry.
+ *
+ * To minimize cache pollution, just follow the stack pointer.
+ */
+ READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
#ifdef CONFIG_X86_32
#ifdef CONFIG_CC_STACKPROTECTOR
@@ -39,6 +61,8 @@ do { \
*/ \
unsigned long ebx, ecx, edx, esi, edi; \
\
+ prepare_switch_to(prev, next); \
+ \
asm volatile("pushl %%ebp\n\t" /* save EBP */ \
"movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
@@ -103,7 +127,9 @@ do { \
* clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
* has no effect.
*/
-#define switch_to(prev, next, last) \
+#define switch_to(prev, next, last) \
+ prepare_switch_to(prev, next); \
+ \
asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b70ca12dd389..907b4e4aeb5e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+#ifdef CONFIG_VMAP_STACK
+static void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address)
+{
+ printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+ (void *)fault_address, current->stack,
+ (char *)current->stack + THREAD_SIZE - 1);
+ die(message, regs, 0);
+
+ /* Be absolutely certain we don't return. */
+ panic(message);
+}
+#endif
+
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long cr2;
+#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_DF;
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * deliv- ered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ cr2 = read_cr2();
+ if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+ handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
#ifdef CONFIG_DOUBLEFAULT
df_debug(regs, error_code);
#endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4dbe65622810..a7655f6caf7d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+ pgd_t *pgd = next->pgd + stack_pgd_index;
+
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ }
+
#ifdef CONFIG_SMP
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
#endif
+
cpumask_set_cpu(cpu, mm_cpumask(next));
/*