From 72f5e08dbba2d01aa90b592cf76c378ea233b00b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:20 +0100 Subject: x86/entry: Remap the TSS into the CPU entry area This has a secondary purpose: it puts the entry stack into a region with a well-controlled layout. A subsequent patch will take advantage of this to streamline the SYSCALL entry code to be able to find it more easily. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4838037f97f6..0ab316c46806 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -941,7 +941,8 @@ ENTRY(debug) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -984,7 +985,8 @@ ENTRY(nmi) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack -- cgit v1.2.3 From 0f9a48100fba3f189724ae88a450c2261bf91c80 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:28 +0100 Subject: x86/entry: Clean up the SYSENTER_stack code The existing code was a mess, mainly because C arrays are nasty. Turn SYSENTER_stack into a struct, add a helper to find it, and do all the obvious cleanups this enables. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/fixmap.h | 5 +++++ arch/x86/include/asm/processor.h | 6 +++++- arch/x86/kernel/asm-offsets.c | 6 ++---- arch/x86/kernel/cpu/common.c | 14 +++----------- arch/x86/kernel/dumpstack.c | 7 +++---- 7 files changed, 21 insertions(+), 23 deletions(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0ab316c46806..3629bcbf85a2 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,7 +942,7 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -986,7 +986,7 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2582984ffb4b..575b184f377f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,7 +154,7 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 451da7d9a502..cc5d98bdca37 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -245,5 +245,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); } +static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) +{ + return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d34ac13c5866..f933869470b8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -336,12 +336,16 @@ struct x86_hw_tss { #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 +struct SYSENTER_stack { + unsigned long words[64]; +}; + struct tss_struct { /* * Space for the temporary SYSENTER stack, used for SYSENTER * and the entry trampoline as well. */ - unsigned long SYSENTER_stack[64]; + struct SYSENTER_stack SYSENTER_stack; /* * The fixed hardware portion. This must not cross a page boundary diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 61b1af88ac07..46c0995344aa 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -94,10 +94,8 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fb01a8e5e9b7..3de7480e4f32 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1314,12 +1314,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1436,9 +1431,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1653,8 +1646,7 @@ void cpu_init(void) */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); - load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); + load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae1ce2e3f132..bbd6d986e2d0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) { - int cpu = smp_processor_id(); - struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); - void *begin = &tss->SYSENTER_stack; - void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + void *begin = ss; + void *end = ss + 1; if ((void *)stack < begin || (void *)stack >= end) return false; -- cgit v1.2.3 From c482feefe1aeb150156248ba0fd3e029bc886605 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:29 +0100 Subject: x86/entry/64: Make cpu_entry_area.tss read-only The TSS is a fairly juicy target for exploits, and, now that the TSS is in the cpu_entry_area, it's no longer protected by kASLR. Make it read-only on x86_64. On x86_32, it can't be RO because it's written by the CPU during task switches, and we use a task gate for double faults. I'd also be nervous about errata if we tried to make it RO even on configurations without double fault handling. [ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So it's probably safe to assume that it's a non issue, though Intel might have been creative in that area. Still waiting for confirmation. ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/fixmap.h | 13 +++++++++---- arch/x86/include/asm/processor.h | 17 ++++++++--------- arch/x86/include/asm/switch_to.h | 4 ++-- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/asm-offsets.c | 5 ++--- arch/x86/kernel/asm-offsets_32.c | 4 ++-- arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++---------- arch/x86/kernel/ioport.c | 2 +- arch/x86/kernel/process.c | 6 +++--- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- arch/x86/lib/delay.c | 4 ++-- arch/x86/xen/enlighten_pv.c | 2 +- 16 files changed, 60 insertions(+), 48 deletions(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3629bcbf85a2..bd8b57a5c874 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,7 +942,7 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -986,7 +986,7 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 575b184f377f..2812ce043a7a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,7 +154,7 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ +#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) @@ -390,7 +390,7 @@ syscall_return_via_sysret: * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -719,7 +719,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -934,7 +934,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) /* * Switch to the thread stack. This is called with the IRET frame and diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index cc5d98bdca37..94fc4fa14127 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -56,9 +56,14 @@ struct cpu_entry_area { char gdt[PAGE_SIZE]; /* - * The GDT is just below cpu_tss and thus serves (on x86_64) as a - * a read-only guard page for the SYSENTER stack at the bottom - * of the TSS region. + * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ + struct SYSENTER_stack_page SYSENTER_stack_page; + + /* + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because + * we need task switches to work, and task switches write to the TSS. */ struct tss_struct tss; @@ -247,7 +252,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) { - return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; + return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f933869470b8..e8991d7f7034 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -340,13 +340,11 @@ struct SYSENTER_stack { unsigned long words[64]; }; -struct tss_struct { - /* - * Space for the temporary SYSENTER stack, used for SYSENTER - * and the entry trampoline as well. - */ - struct SYSENTER_stack SYSENTER_stack; +struct SYSENTER_stack_page { + struct SYSENTER_stack stack; +} __aligned(PAGE_SIZE); +struct tss_struct { /* * The fixed hardware portion. This must not cross a page boundary * at risk of violating the SDM's advice and potentially triggering @@ -363,7 +361,7 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; } __aligned(PAGE_SIZE); -DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* * sizeof(unsigned long) coming from an extra "long" at the end @@ -378,7 +376,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else -#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 +/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ +#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif /* @@ -538,7 +537,7 @@ static inline void native_set_iopl_mask(unsigned mask) static inline void native_load_sp0(unsigned long sp0) { - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static inline void native_swapgs(void) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index cbc71e73bd32..9b6df68d8fd1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,10 +79,10 @@ do { \ static inline void refresh_sysenter_cs(struct thread_struct *thread) { /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) return; - this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 44a04999791e..00223333821a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) +# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) #endif #endif diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 46c0995344aa..cd360a5e0dca 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -94,10 +94,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); - DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); - /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 52ce4ea16e53..7d20d9c0b3d6 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -47,8 +47,8 @@ void foo(void) BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3de7480e4f32..c2eada1056de 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif +static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, + SYSENTER_stack_storage); + static void __init set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) { @@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu) #ifdef CONFIG_X86_64 extern char _entry_trampoline[]; - /* On 64-bit systems, we use a read-only fixmap GDT. */ + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; #else /* * On native 32-bit systems, the GDT cannot be read-only because * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. */ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), + per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, + PAGE_KERNEL); /* * The Intel SDM says (Volume 3, 7.2.1): @@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu) offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), - &per_cpu(cpu_tss, cpu), + &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, - PAGE_KERNEL); + tss_prot); #ifdef CONFIG_X86_32 per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); @@ -1305,7 +1314,7 @@ void enable_sep_cpu(void) return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1575,7 +1584,7 @@ void cpu_init(void) if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1667,7 +1676,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6a04287f222b..517415978409 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower @@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e1..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 157f81816915..c75466232016 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5ade4f89a6d1..74136fd16f49 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -364,7 +364,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* * regs->sp points to the failing IRET frame on the @@ -649,7 +649,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc4..4846eff7e4c8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops) delay = min_t(u64, MWAITX_MAX_LOOPS, loops); /* - * Use cpu_tss as a cacheline-aligned, seldomly + * Use cpu_tss_rw as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index fbd054d6ac97..ae3a071e1d0f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) -- cgit v1.2.3 From 4fe2d8b11a370af286287a2661de9d4e6c9a145a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 17:25:07 -0800 Subject: x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack If the kernel oopses while on the trampoline stack, it will print "" even if SYSENTER is not involved. That is rather confusing. The "SYSENTER" stack is used for a lot more than SYSENTER now. Give it a better string to display in stack dumps, and rename the kernel code to match. Also move the 32-bit code over to the new naming even though it still uses the entry stack only for SYSENTER. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 12 ++++++------ arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/fixmap.h | 8 ++++---- arch/x86/include/asm/processor.h | 6 +++--- arch/x86/include/asm/stacktrace.h | 4 ++-- arch/x86/kernel/asm-offsets.c | 4 ++-- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/cpu/common.c | 14 +++++++------- arch/x86/kernel/dumpstack.c | 10 +++++----- arch/x86/kernel/dumpstack_32.c | 6 +++--- arch/x86/kernel/dumpstack_64.c | 12 +++++++++--- 11 files changed, 44 insertions(+), 38 deletions(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bd8b57a5c874..ace8f321a5a1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,9 +942,9 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Ldebug_from_sysenter_stack TRACE_IRQS_OFF @@ -986,9 +986,9 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2812ce043a7a..87cebe78bbef 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,8 +154,8 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ - SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA +#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) UNWIND_HINT_EMPTY diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 94fc4fa14127..8153b8d86a3c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -56,10 +56,10 @@ struct cpu_entry_area { char gdt[PAGE_SIZE]; /* - * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as + * The GDT is just below entry_stack and thus serves (on x86_64) as * a a read-only guard page. */ - struct SYSENTER_stack_page SYSENTER_stack_page; + struct entry_stack_page entry_stack_page; /* * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because @@ -250,9 +250,9 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); } -static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) +static inline struct entry_stack *cpu_entry_stack(int cpu) { - return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; + return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index da943411d3d8..9e482d8b0b97 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -336,12 +336,12 @@ struct x86_hw_tss { #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 -struct SYSENTER_stack { +struct entry_stack { unsigned long words[64]; }; -struct SYSENTER_stack_page { - struct SYSENTER_stack stack; +struct entry_stack_page { + struct entry_stack stack; } __aligned(PAGE_SIZE); struct tss_struct { diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f8062bfd43a0..f73706878772 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,7 +16,7 @@ enum stack_type { STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, - STACK_TYPE_SYSENTER, + STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; @@ -29,7 +29,7 @@ struct stack_info { bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); -bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); +bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cd360a5e0dca..676b7cf4b62b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -97,6 +97,6 @@ void common(void) { /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); - OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); - DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 7d20d9c0b3d6..fa1261eefa16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -48,7 +48,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - - offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 034900623adf..ed4acbce37a8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,8 +487,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif -static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, - SYSENTER_stack_storage); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, + entry_stack_storage); static void __init set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) @@ -523,8 +523,8 @@ static void __init setup_cpu_entry_area(int cpu) #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), - per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), + per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); /* @@ -1323,7 +1323,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1440,7 +1440,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1655,7 +1655,7 @@ void cpu_init(void) */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); - load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index bbd6d986e2d0..1dd3f533d78c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -43,9 +43,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } -bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) +bool in_entry_stack(unsigned long *stack, struct stack_info *info) { - struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); void *begin = ss; void *end = ss + 1; @@ -53,7 +53,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) if ((void *)stack < begin || (void *)stack >= end) return false; - info->type = STACK_TYPE_SYSENTER; + info->type = STACK_TYPE_ENTRY; info->begin = begin; info->end = end; info->next_sp = NULL; @@ -111,13 +111,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) - * - SYSENTER stack + * - entry stack * * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack - * - SYSENTER stack + * - entry stack */ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5ff13a6b3680..04170f63e3a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; - if (type == STACK_TYPE_SYSENTER) - return "SYSENTER"; + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; return NULL; } @@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; - if (in_sysenter_stack(stack, info)) + if (in_entry_stack(stack, info)) goto recursion_check; if (in_hardirq_stack(stack, info)) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index abc828f8c297..563e28d14f2c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; - if (type == STACK_TYPE_SYSENTER) - return "SYSENTER"; + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. + */ + return "ENTRY_TRAMPOLINE"; + } if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; - if (in_sysenter_stack(stack, info)) + if (in_entry_stack(stack, info)) goto recursion_check; goto unknown; -- cgit v1.2.3 From 2641f08bb7fc63a636a2b18173221d7040a3512e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:28 +0000 Subject: x86/retpoline/entry: Convert entry assembler indirect jumps Convert indirect jumps in core 32/64bit entry assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return address after the 'call' instruction must be *precisely* at the .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, and the use of alternatives will mess that up unless we play horrid games to prepend with NOPs and make the variants the same length. It's not worth it; in the case where we ALTERNATIVE out the retpoline, the first instruction at __x86.indirect_thunk.rax is going to be a bare jmp *%rax anyway. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/entry/entry_64.S | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ace8f321a5a1..a1f28a54f23a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -290,7 +291,7 @@ ENTRY(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - call *%ebx + CALL_NOSPEC %ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -919,7 +920,7 @@ common_exception: movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(common_exception) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ed31d00dc5ee..59874bc1aed2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "calling.h" @@ -187,7 +188,7 @@ ENTRY(entry_SYSCALL_64_trampoline) */ pushq %rdi movq $entry_SYSCALL_64_stage2, %rdi - jmp *%rdi + JMP_NOSPEC %rdi END(entry_SYSCALL_64_trampoline) .popsection @@ -266,7 +267,12 @@ entry_SYSCALL_64_fastpath: * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif .Lentry_SYSCALL_64_after_fastpath_call: movq %rax, RAX(%rsp) @@ -438,7 +444,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - jmp *%rax /* Called from C */ + JMP_NOSPEC %rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -517,7 +523,7 @@ ENTRY(ret_from_fork) 1: /* kernel thread */ movq %r12, %rdi - call *%rbx + CALL_NOSPEC %rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() -- cgit v1.2.3 From c995efd5a740d9cbafbf58bde4973e8b50b4d761 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 17:49:25 +0000 Subject: x86/retpoline: Fill RSB on context switch for affected CPUs On context switch from a shallow call stack to a deeper one, as the CPU does 'ret' up the deeper side it may encounter RSB entries (predictions for where the 'ret' goes to) which were populated in userspace. This is problematic if neither SMEP nor KPTI (the latter of which marks userspace pages as NX for the kernel) are active, as malicious code in userspace may then be executed speculatively. Overwrite the CPU's return prediction stack with calls which are predicted to return to an infinite loop, to "capture" speculation if this happens. This is required both for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI. On Skylake+ the problem is slightly different, and an *underflow* of the RSB may cause errant branch predictions to occur. So there it's not so much overwrite, as *filling* the RSB to attempt to prevent it getting empty. This is only a partial solution for Skylake+ since there are many other conditions which may result in the RSB becoming empty. The full solution on Skylake+ is to use IBRS, which will prevent the problem even when the RSB becomes empty. With IBRS, the RSB-stuffing will not be required on context switch. [ tglx: Added missing vendor check and slighty massaged comments and changelog ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 11 +++++++++++ arch/x86/entry/entry_64.S | 11 +++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a1f28a54f23a..60c4c342316c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -244,6 +244,17 @@ ENTRY(__switch_to_asm) movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popl %esi popl %edi diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 59874bc1aed2..d54a0ede61d1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -487,6 +487,17 @@ ENTRY(__switch_to_asm) movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popq %r15 popq %r14 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f275447862f4..aa09559b2c0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,6 +211,7 @@ #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e4dc26185aa7..390b3dc3d438 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -23,6 +23,7 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); @@ -155,6 +156,23 @@ disable: return SPECTRE_V2_CMD_NONE; } +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -213,6 +231,24 @@ retpoline_auto: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_PTI) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } } #undef pr_fmt -- cgit v1.2.3 From 1dde7415e99933bb7293d6b2843752cbdb43ec11 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Jan 2018 16:24:33 +0000 Subject: x86/retpoline: Simplify vmexit_fill_RSB() Simplify it to call an asm-function instead of pasting 41 insn bytes at every call site. Also, add alignment to the macro as suggested here: https://support.google.com/faqs/answer/7625886 [dwmw2: Clean up comments, let it clobber %ebx and just tell the compiler] Signed-off-by: Borislav Petkov Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1517070274-12128-3-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 3 +- arch/x86/entry/entry_64.S | 3 +- arch/x86/include/asm/asm-prototypes.h | 3 ++ arch/x86/include/asm/nospec-branch.h | 70 ++++------------------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 56 ++++++++++++++++++++++++++++ 6 files changed, 71 insertions(+), 65 deletions(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 60c4c342316c..2a35b1e0fb90 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -252,7 +252,8 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + /* Clobbers %ebx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 63f4320602a3..b4f00984089e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -495,7 +495,8 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + /* Clobbers %rbx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 1908214b9125..4d111616524b 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -38,4 +38,7 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) +asmlinkage void __fill_rsb(void); +asmlinkage void __clear_rsb(void); + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 19ecb5446b30..df4ececa6ebf 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,50 +7,6 @@ #include #include -/* - * Fill the CPU return stack buffer. - * - * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; lfence; jmp' loop to capture speculative execution. - * - * This is required in various cases for retpoline and IBRS-based - * mitigations for the Spectre variant 2 vulnerability. Sometimes to - * eliminate potentially bogus entries from the RSB, and sometimes - * purely to ensure that it doesn't get empty, which on some CPUs would - * allow predictions from other (unwanted!) sources to be used. - * - * We define a CPP macro such that it can be used from both .S files and - * inline assembly. It's possible to do a .macro and then include that - * from C via asm(".include ") but let's not go there. - */ - -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ -#define RSB_FILL_LOOPS 16 /* To avoid underflow */ - -/* - * Google experimented with loop-unrolling and this turned out to be - * the optimal version — two calls, each with their own speculation - * trap should their return address end up getting used, in a loop. - */ -#define __FILL_RETURN_BUFFER(reg, nr, sp) \ - mov $(nr/2), reg; \ -771: \ - call 772f; \ -773: /* speculation trap */ \ - pause; \ - lfence; \ - jmp 773b; \ -772: \ - call 774f; \ -775: /* speculation trap */ \ - pause; \ - lfence; \ - jmp 775b; \ -774: \ - dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; - #ifdef __ASSEMBLY__ /* @@ -121,17 +77,10 @@ #endif .endm - /* - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP - * monstrosity above, manually. - */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +/* This clobbers the BX register */ +.macro FILL_RETURN_BUFFER nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr -.Lskip_rsb_\@: + ALTERNATIVE "", "call __clear_rsb", \ftr #endif .endm @@ -206,15 +155,10 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops; - - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE("jmp 910f", - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), - X86_FEATURE_RETPOLINE) - "910:" - : "=r" (loops), ASM_CALL_CONSTRAINT - : : "memory" ); + alternative_input("", + "call __fill_rsb", + X86_FEATURE_RETPOLINE, + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); #endif } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d435c89875c1..d0a3170e6804 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_RETPOLINE) += retpoline.o +OBJECT_FILES_NON_STANDARD_retpoline.o :=y obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index c909961e678a..480edc3a5e03 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,6 +7,7 @@ #include #include #include +#include .macro THUNK reg .section .text.__x86.indirect_thunk @@ -46,3 +47,58 @@ GENERATE_THUNK(r13) GENERATE_THUNK(r14) GENERATE_THUNK(r15) #endif + +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; lfence; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * Google experimented with loop-unrolling and this turned out to be + * the optimal version - two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +.macro STUFF_RSB nr:req sp:req + mov $(\nr / 2), %_ASM_BX + .align 16 +771: + call 772f +773: /* speculation trap */ + pause + lfence + jmp 773b + .align 16 +772: + call 774f +775: /* speculation trap */ + pause + lfence + jmp 775b + .align 16 +774: + dec %_ASM_BX + jnz 771b + add $((BITS_PER_LONG/8) * \nr), \sp +.endm + +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +ENTRY(__fill_rsb) + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP + ret +END(__fill_rsb) +EXPORT_SYMBOL_GPL(__fill_rsb) + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ + +ENTRY(__clear_rsb) + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP + ret +END(__clear_rsb) +EXPORT_SYMBOL_GPL(__clear_rsb) -- cgit v1.2.3 From 93286261de1b46339aa27cd4c639b21778f6cade Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:33 +0100 Subject: x86/hyperv: Reenlightenment notifications support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V supports Live Migration notification. This is supposed to be used in conjunction with TSC emulation: when a VM is migrated to a host with different TSC frequency for some short period the host emulates the accesses to TSC and sends an interrupt to notify about the event. When the guest is done updating everything it can disable TSC emulation and everything will start working fast again. These notifications weren't required until now as Hyper-V guests are not supposed to use TSC as a clocksource: in Linux the TSC is even marked as unstable on boot. Guests normally use 'tsc page' clocksource and host updates its values on migrations automatically. Things change when with nested virtualization: even when the PV clocksources (kvm-clock or tsc page) are passed through to the nested guests the TSC frequency and frequency changes need to be know.. Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that the fix in on the way. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-4-vkuznets@redhat.com --- arch/x86/entry/entry_32.S | 3 ++ arch/x86/entry/entry_64.S | 3 ++ arch/x86/hyperv/hv_init.c | 89 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/irq_vectors.h | 7 ++- arch/x86/include/asm/mshyperv.h | 9 ++++ arch/x86/include/uapi/asm/hyperv.h | 27 ++++++++++++ arch/x86/kernel/cpu/mshyperv.c | 6 +++ 7 files changed, 143 insertions(+), 1 deletion(-) (limited to 'arch/x86/entry/entry_32.S') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2a35b1e0fb90..7a796eeddf99 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -895,6 +895,9 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, hyperv_vector_handler) +BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_intr) + #endif /* CONFIG_HYPERV */ ENTRY(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a83570495162..553aa49909ce 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1245,6 +1245,9 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler + +apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ + hyperv_reenlightenment_vector hyperv_reenlightenment_intr #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a6c63f721bc..712ac40081f7 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -18,6 +18,8 @@ */ #include +#include +#include #include #include #include @@ -102,6 +104,93 @@ static int hv_cpu_init(unsigned int cpu) return 0; } +static void (*hv_reenlightenment_cb)(void); + +static void hv_reenlightenment_notify(struct work_struct *dummy) +{ + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + /* Don't issue the callback if TSC accesses are not emulated */ + if (hv_reenlightenment_cb && emu_status.inprogress) + hv_reenlightenment_cb(); +} +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); + +void hyperv_stop_tsc_emulation(void) +{ + u64 freq; + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + emu_status.inprogress = 0; + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + tsc_khz = div64_u64(freq, 1000); +} +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); + +static inline bool hv_reenlightenment_available(void) +{ + /* + * Check for required features and priviliges to make TSC frequency + * change notifications work. + */ + return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && + ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; +} + +__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +{ + entering_ack_irq(); + + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); + + exiting_irq(); +} + +void set_hv_tscchange_cb(void (*cb)(void)) +{ + struct hv_reenlightenment_control re_ctrl = { + .vector = HYPERV_REENLIGHTENMENT_VECTOR, + .enabled = 1, + .target_vp = hv_vp_index[smp_processor_id()] + }; + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; + + if (!hv_reenlightenment_available()) { + pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + return; + } + + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ + wmb(); + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); +} +EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); + +void clear_hv_tscchange_cb(void) +{ + struct hv_reenlightenment_control re_ctrl; + + if (!hv_reenlightenment_available()) + return; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + + hv_reenlightenment_cb = NULL; +} +EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 67421f649cfa..e71c1120426b 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -103,7 +103,12 @@ #endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -#define LOCAL_TIMER_VECTOR 0xee + +#if IS_ENABLED(CONFIG_HYPERV) +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#endif + +#define LOCAL_TIMER_VECTOR 0xed #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b1d4ea78270..1790002a2052 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -160,6 +160,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) void hyperv_callback_vector(void); +void hyperv_reenlightenment_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector #endif @@ -316,11 +317,19 @@ void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs, long err); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); + +void hyperv_reenlightenment_intr(struct pt_regs *regs); +void set_hv_tscchange_cb(void (*cb)(void)); +void clear_hv_tscchange_cb(void); +void hyperv_stop_tsc_emulation(void); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline bool hv_is_hypercall_page_setup(void) { return false; } static inline void hyperv_cleanup(void) {} static inline void hyperv_setup_mmu_ops(void) {} +static inline void set_hv_tscchange_cb(void (*cb)(void)) {} +static inline void clear_hv_tscchange_cb(void) {} +static inline void hyperv_stop_tsc_emulation(void) {}; #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 1a5bfead93b4..197c2e6c7376 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -40,6 +40,9 @@ */ #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) +/* AccessReenlightenmentControls privilege */ +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) + /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available @@ -234,6 +237,30 @@ #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 + +struct hv_reenlightenment_control { + u64 vector:8; + u64 reserved1:8; + u64 enabled:1; + u64 reserved2:15; + u64 target_vp:32; +}; + +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 + +struct hv_tsc_emulation_control { + u64 enabled:1; + u64 reserved:63; +}; + +struct hv_tsc_emulation_status { + u64 inprogress:1; + u64 reserved:63; +}; + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 85eb5fc180c8..9340f41ce8d3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void) hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_vector); + #endif } -- cgit v1.2.3