From 1bdfac19b3ecfca545281c15c7aea7ebc2eaef31 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:49 -0400 Subject: x86-64: Pad vDSO to a page boundary This avoids an information leak to userspace. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a63380a3c58a0506a2f5a18ba1b12dbde1f25e58.1312378163.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1b979c12ba85..01f5e3b4613c 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -9,6 +9,7 @@ __PAGE_ALIGNED_DATA vdso_start: .incbin "arch/x86/vdso/vdso.so" vdso_end: + .align PAGE_SIZE /* extra data here leaks to userspace. */ .previous -- cgit v1.2.3 From 9c40818da5b39fca236029059ab839857b1ef56c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:50 -0400 Subject: x86-64: Move the "user" vsyscall segment out of the data segment. The kernel's loader doesn't seem to care, but gold complains. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f0716870c297242a841b949953d80c0d87bf3d3f.1312378163.git.luto@mit.edu Reported-by: Arkadiusz Miskiewicz Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4aa9c54a9b76..e79fb3951fce 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -154,6 +154,24 @@ SECTIONS #ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + #define VSYSCALL_ADDR (-10*1024*1024) #define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) @@ -162,7 +180,6 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = ALIGN(4096); __vsyscall_0 = .; . = VSYSCALL_ADDR; @@ -185,23 +202,6 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ - *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ -- cgit v1.2.3 From f670bb760e7d32ec9c690e748a1d5d04921363ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:51 -0400 Subject: x86-64: Work around gold bug 13023 Gold has trouble assigning numbers to the location counter inside of an output section description. The bug was triggered by 9fd67b4ed0714ab718f1f9bd14c344af336a6df7, which consolidated all of the vsyscall sections into a single section. The workaround is IMO still nicer than the old way of doing it. This produces an apparently valid kernel image and passes my vdso tests on both GNU ld version 2.21.51.0.6-2.fc15 20110118 and GNU gold (version 2.21.51.0.6-2.fc15 20110118) 1.10 as distributed by Fedora 15. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/0b260cb806f1f9a25c00ce8377a5f035d57f557a.1312378163.git.luto@mit.edu Reported-by: Arkadiusz Miskiewicz Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e79fb3951fce..8f3a265476d7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -158,10 +158,12 @@ SECTIONS __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) #define __VVAR_KERNEL_LDS #include @@ -184,15 +186,17 @@ SECTIONS . = VSYSCALL_ADDR; .vsyscall : AT(VLOAD(.vsyscall)) { + /* work around gold bug 13023 */ + __vsyscall_beginning_hack = .; *(.vsyscall_0) - . = 1024; + . = __vsyscall_beginning_hack + 1024; *(.vsyscall_1) - . = 2048; + . = __vsyscall_beginning_hack + 2048; *(.vsyscall_2) - . = 4096; /* Pad the whole page. */ + . = __vsyscall_beginning_hack + 4096; /* Pad the whole page. */ } :user =0xcc . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); -- cgit v1.2.3 From 5d5791af4c0d4fd32093882357506355c3357503 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:52 -0400 Subject: x86-64, xen: Enable the vvar mapping Xen needs to handle VVAR_PAGE, introduced in git commit: 9fd67b4ed0714ab718f1f9bd14c344af336a6df7 x86-64: Give vvars their own page Otherwise we die during bootup with a message like: (XEN) mm.c:940:d10 Error getting mfn 1888 (pfn 1e3e48) from L1 entry 8000000001888465 for l1e_owner=10, pg_owner=10 (XEN) mm.c:5049:d10 ptwr_emulate: could not get_page_from_l1e() [ 0.000000] BUG: unable to handle kernel NULL pointer dereference at (null) [ 0.000000] IP: [] xen_set_pte+0x20/0xe0 Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/4659478ed2f3480938f96491c2ecbe2b2e113a23.1312378163.git.luto@mit.edu Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0ccccb67a993..2e78619bc538 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1829,6 +1829,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: + case VVAR_PAGE: #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -1869,7 +1870,8 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #ifdef CONFIG_X86_64 /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || + idx == VVAR_PAGE) { unsigned long vaddr = __fix_to_virt(idx); set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); } -- cgit v1.2.3 From 318f5a2a672152328c9fb4dead504b89ec738a43 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:53 -0400 Subject: x86-64: Add user_64bit_mode paravirt op Three places in the kernel assume that the only long mode CPL 3 selector is __USER_CS. This is not true on Xen -- Xen's sysretq changes cs to the magic value 0xe033. Two of the places are corner cases, but as of "x86-64: Improve vsyscall emulation CS and RIP handling" (c9712944b2a12373cb6ff8059afcfb7e826a6c54), vsyscalls will segfault if called with Xen's extra CS selector. This causes a panic when older init builds die. It seems impossible to make Xen use __USER_CS reliably without taking a performance hit on every system call, so this fixes the tests instead with a new paravirt op. It's a little ugly because ptrace.h can't include paravirt.h. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f4fcb3947340d9e96ce1054a432f183f9da9db83.1312378163.git.luto@mit.edu Reported-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/desc.h | 4 ++-- arch/x86/include/asm/paravirt_types.h | 6 ++++++ arch/x86/include/asm/ptrace.h | 19 +++++++++++++++++++ arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/kernel/step.c | 2 +- arch/x86/kernel/vsyscall_64.c | 6 +----- arch/x86/mm/fault.c | 2 +- arch/x86/xen/enlighten.c | 4 ++++ 8 files changed, 38 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 7b439d9aea2a..41935fadfdfc 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -27,8 +27,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->base2 = (info->base_addr & 0xff000000) >> 24; /* - * Don't allow setting of the lm bit. It is useless anyway - * because 64bit system calls require __USER_CS: + * Don't allow setting of the lm bit. It would confuse + * user_64bit_mode and would get overridden by sysret anyway. */ desc->l = 0; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c869..96a0f80407b8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,6 +41,7 @@ #include #include +#include struct page; struct thread_struct; @@ -63,6 +64,11 @@ struct paravirt_callee_save { struct pv_info { unsigned int kernel_rpl; int shared_kernel_pmd; + +#ifdef CONFIG_X86_64 + u16 extra_user_64bit_cs; /* __USER_CS if none */ +#endif + int paravirt_enabled; const char *name; }; diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 94e7618fcac8..35664547125b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -131,6 +131,9 @@ struct pt_regs { #ifdef __KERNEL__ #include +#ifdef CONFIG_PARAVIRT +#include +#endif struct cpuinfo_x86; struct task_struct; @@ -187,6 +190,22 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } +#ifdef CONFIG_X86_64 +static inline bool user_64bit_mode(struct pt_regs *regs) +{ +#ifndef CONFIG_PARAVIRT + /* + * On non-paravirt systems, this is the only long mode CPL 3 + * selector. We do not allow long mode selectors in the LDT. + */ + return regs->cs == __USER_CS; +#else + /* Headers are too twisted for this to go in paravirt.h. */ + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; +#endif +} +#endif + /* * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode * when it traps. The previous stack will be directly underneath the saved diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71b..681f15994218 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -299,6 +299,10 @@ struct pv_info pv_info = { .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = __USER_CS, +#endif }; struct pv_init_ops pv_init_ops = { diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 7977f0cfe339..c346d1161488 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -74,7 +74,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) #ifdef CONFIG_X86_64 case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) + if (!user_64bit_mode(regs)) /* 32-bit mode: register increment */ return 0; /* 64-bit mode: REX prefix */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dda7dff9cef7..1725930a6f9f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -127,11 +127,7 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) local_irq_enable(); - /* - * Real 64-bit user mode code has cs == __USER_CS. Anything else - * is bogus. - */ - if (regs->cs != __USER_CS) { + if (!user_64bit_mode(regs)) { /* * If we trapped from kernel mode, we might as well OOPS now * instead of returning to some random address and OOPSing diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf4c7e5..c1d018238f32 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -105,7 +105,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, * but for now it's good enough to assume that long * mode only uses well known segments or kernel. */ - return (!user_mode(regs)) || (regs->cs == __USER_CS); + return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5525163a0398..78fe33d03565 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -937,6 +937,10 @@ static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", }; -- cgit v1.2.3 From c149a665ac488e0dac22a42287f45ad1bda06ff1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:54 -0400 Subject: x86-64: Add vsyscall:emulate_vsyscall trace event Vsyscall emulation is slow, so make it easy to track down. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/cdaad7da946a80b200df16647c1700db3e1171e9.1312378163.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vsyscall_64.c | 6 ++++++ arch/x86/kernel/vsyscall_trace.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1725930a6f9f..93a0d46d9569 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -50,6 +50,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { @@ -146,6 +149,9 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) * and int 0xcc is two bytes long. */ vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + + trace_emulate_vsyscall(vsyscall_nr); + if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h new file mode 100644 index 000000000000..a8b2edec54fe --- /dev/null +++ b/arch/x86/kernel/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_FILE vsyscall_trace +#include -- cgit v1.2.3 From f3fb5b7bb70d6e679c15fef85707810a067f5fb6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:30 -0400 Subject: x86: Remove unnecessary compile flag tweaks for vsyscall code As of commit 98d0ac38ca7b1b7a552c9a2359174ff84decb600 Author: Andy Lutomirski Date: Thu Jul 14 06:47:22 2011 -0400 x86-64: Move vread_tsc and vread_hpet into the vDSO user code no longer directly calls into code in arch/x86/kernel/, so we don't need compile flag hacks to make it safe. All vdso code is in the vdso directory now. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/835cd05a4c7740544d09723d6ba48f4406f9826c.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/Makefile | 13 ------------- arch/x86/kernel/vsyscall_64.c | 3 --- 2 files changed, 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2deef3d2435a..3d1ac3978707 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,19 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -# -# vsyscalls (which work on the user stack) should have -# no stack-protector checks: -# -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) -CFLAGS_hpet.o := $(nostackp) -CFLAGS_paravirt.o := $(nostackp) -GCOV_PROFILE_vsyscall_64.o := n -GCOV_PROFILE_hpet.o := n -GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_paravirt.o := n - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 93a0d46d9569..bf8e9ffee6e9 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,9 +18,6 @@ * use the vDSO. */ -/* Disable profiling for userspace code: */ -#define DISABLE_BRANCH_PROFILING - #include #include #include -- cgit v1.2.3 From fce8dc06423d6fb2709469dc5c55b04e09c1d126 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:31 -0400 Subject: x86-64: Wire up getcpu syscall getcpu is available as a vdso entry and an emulated vsyscall. Programs that for some reason don't want to use the vdso should still be able to call getcpu without relying on the slow emulated vsyscall. It costs almost nothing to expose it as a real syscall. We also need this for the following patch in vsyscall=native mode. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/6b19f55bdb06a0c32c2fa6dba9b6f222e1fde999.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/unistd_64.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 705bf139288c..d92641cc7acc 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -681,6 +681,8 @@ __SYSCALL(__NR_syncfs, sys_syncfs) __SYSCALL(__NR_sendmmsg, sys_sendmmsg) #define __NR_setns 308 __SYSCALL(__NR_setns, sys_setns) +#define __NR_getcpu 309 +__SYSCALL(__NR_getcpu, sys_getcpu) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR -- cgit v1.2.3 From 3ae36655b97a03fa1decf72f04078ef945647c1a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:32 -0400 Subject: x86-64: Rework vsyscall emulation and add vsyscall= parameter There are three choices: vsyscall=native: Vsyscalls are native code that issues the corresponding syscalls. vsyscall=emulate (default): Vsyscalls are emulated by instruction fault traps, tested in the bad_area path. The actual contents of the vsyscall page is the same as the vsyscall=native case except that it's marked NX. This way programs that make assumptions about what the code in the page does will not be confused when they read that code. vsyscall=none: Trying to execute a vsyscall will segfault. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/8449fb3abf89851fd6b2260972666a6f82542284.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 21 ++++++++++ arch/x86/include/asm/irq_vectors.h | 4 -- arch/x86/include/asm/traps.h | 2 - arch/x86/include/asm/vsyscall.h | 6 +++ arch/x86/kernel/entry_64.S | 1 - arch/x86/kernel/traps.c | 6 --- arch/x86/kernel/vmlinux.lds.S | 33 ---------------- arch/x86/kernel/vsyscall_64.c | 79 +++++++++++++++++++++++-------------- arch/x86/kernel/vsyscall_emu_64.S | 36 +++++++++++------ arch/x86/mm/fault.c | 12 ++++++ 10 files changed, 111 insertions(+), 89 deletions(-) (limited to 'arch') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index aa47be71df4c..9cfd6bb9198e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2657,6 +2657,27 @@ bytes respectively. Such letter suffixes can also be entirely omitted. vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: + vsyscall= [X86-64] + Controls the behavior of vsyscalls (i.e. calls to + fixed addresses of 0xffffffffff600x00 from legacy + code). Most statically-linked binaries and older + versions of glibc use these calls. Because these + functions are at fixed addresses, they make nice + targets for exploits that can control RIP. + + emulate [default] Vsyscalls turn into traps and are + emulated reasonably safely. + + native Vsyscalls are native syscall instructions. + This is a little bit faster than trapping + and makes a few dynamic recompilers work + better than they would in emulation mode. + It also makes exploits much easier to write. + + none Vsyscalls don't work at all. This makes + them quite hard to use for exploits but + might break your system. + vt.cur_default= [VT] Default cursor shape. Format: 0xCCBBAA, where AA, BB, and CC are the same as the parameters of the [?A;B;Cc escape sequence; diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index a563c509edcb..2c224e183b52 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,7 +17,6 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vector 204 : legacy x86_64 vsyscall emulation * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * @@ -51,9 +50,6 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 #endif -#ifdef CONFIG_X86_64 -# define VSYSCALL_EMU_VECTOR 0xcc -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2bae0a513b40..0012d0902c5f 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,7 +40,6 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -asmlinkage void emulate_vsyscall(void); dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -67,7 +66,6 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); dotraplinkage void do_machine_check(struct pt_regs *, long); #endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); -dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 60107072c28b..eaea1d31f753 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -27,6 +27,12 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +/* + * Called on instruction fetch fault in vsyscall page. + * Returns true if handled. + */ +extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e949793d6b93..46792d900018 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1123,7 +1123,6 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error -zeroentry emulate_vsyscall do_emulate_vsyscall /* Reload gs selector with exception handling */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a085ca..b9b67166f9de 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,12 +872,6 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif -#ifdef CONFIG_X86_64 - BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); - set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); - set_bit(VSYSCALL_EMU_VECTOR, used_vectors); -#endif - /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8f3a265476d7..0f703f10901a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -71,7 +71,6 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(6); /* RW_ */ #endif @@ -174,38 +173,6 @@ SECTIONS . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); -#define VSYSCALL_ADDR (-10*1024*1024) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - __vsyscall_0 = .; - - . = VSYSCALL_ADDR; - .vsyscall : AT(VLOAD(.vsyscall)) { - /* work around gold bug 13023 */ - __vsyscall_beginning_hack = .; - *(.vsyscall_0) - - . = __vsyscall_beginning_hack + 1024; - *(.vsyscall_1) - - . = __vsyscall_beginning_hack + 2048; - *(.vsyscall_2) - - . = __vsyscall_beginning_hack + 4096; /* Pad the whole page. */ - } :user =0xcc - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); - -#undef VSYSCALL_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index bf8e9ffee6e9..18ae83dd1cd7 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -56,6 +56,27 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + void update_vsyscall_tz(void) { unsigned long flags; @@ -100,7 +121,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, tsk->comm, task_pid_nr(tsk), - message, regs->ip - 2, regs->cs, + message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } @@ -118,45 +139,39 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; long ret; - local_irq_enable(); + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ - if (!user_64bit_mode(regs)) { - /* - * If we trapped from kernel mode, we might as well OOPS now - * instead of returning to some random address and OOPSing - * then. - */ - BUG_ON(!user_mode(regs)); + WARN_ON_ONCE(address != regs->ip); - /* Compat mode and non-compat 32-bit CS should both segfault. */ - warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc from 32-bit mode"); - goto sigsegv; + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; } - /* - * x86-ism here: regs->ip points to the instruction after the int 0xcc, - * and int 0xcc is two bytes long. - */ - vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc (exploit attempt?)"); + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } @@ -201,13 +216,11 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) regs->ip = caller; regs->sp += 8; - local_irq_disable(); - return; + return true; sigsegv: - regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ force_sig(SIGSEGV, current); - local_irq_disable(); + return true; } /* @@ -255,15 +268,21 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) void __init map_vsyscall(void) { - extern char __vsyscall_0; - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != + (unsigned long)VSYSCALL_START); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != + (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S index ffa845eae5ca..c9596a9af159 100644 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -7,21 +7,31 @@ */ #include + #include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret -/* The unused parts of the page are filled with 0xcc by the linker script. */ + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret -.section .vsyscall_0, "a" -ENTRY(vsyscall_0) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_0) + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret -.section .vsyscall_1, "a" -ENTRY(vsyscall_1) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_1) + .balign 4096, 0xcc -.section .vsyscall_2, "a" -ENTRY(vsyscall_2) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_2) + .size __vsyscall_page, 4096 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c1d018238f32..e58935c25b94 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -720,6 +720,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ + if (unlikely((error_code & PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_START))) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); -- cgit v1.2.3