From b93590901a01a6d036b3b7c856bcc5724fdb9911 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:51 -0700 Subject: x86_64/vsyscall: Move all of the gate_area code to vsyscall_64.c This code exists for the sole purpose of making the vsyscall page look sort of like real userspace memory. Move it so that it lives with the rest of the vsyscall code. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a7ee266773671a05f00b7175ca65a0dd812d2e4b.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 49 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/init_64.c | 49 ------------------------------------------- 2 files changed, 49 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 957779f4eb40..521d5ed19547 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -283,6 +283,55 @@ sigsegv: return true; } +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + /* * Assume __initcall executes before all user space. Hopefully kmod * doesn't violate that. We'll find out if it does. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868fc..dd9ca9becc0f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1193,55 +1193,6 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 
32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - static unsigned long probe_memory_block_size(void) { /* start from 2g */ -- cgit v1.2.3 From d4f829dd9026797bd5db8715a30192f23b22afaa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:52 -0700 Subject: x86_64/vdso: Move getcpu code from vsyscall_64.c to vdso/vma.c This is pure cut-and-paste. At this point, vsyscall_64.c contains only code needed for vsyscall emulation, but some of the comments and function names are still confused. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a244daf7d3cbe71afc08ad09fdfe1866ca1f1978.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 57 ---------------------------------------- arch/x86/vdso/vma.c | 61 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 521d5ed19547..2f9ef0c1d112 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -332,49 +332,6 @@ int in_gate_area_no_mm(unsigned long addr) return (addr & PAGE_MASK) == VSYSCALL_ADDR; } -/* - * Assume __initcall executes before all user space. Hopefully kmod - * doesn't violate that. We'll find out if it does. - */ -static void vsyscall_set_cpu(int cpu) -{ - unsigned long d; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded quickly - * in user space in vgetcpu. 
(12 bits for the CPU and 8 bits for the node) - */ - d = 0x0f40000000000ULL; - d |= cpu; - d |= (node & 0xf) << 12; - d |= (node >> 4) << 48; - - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -} - -static void cpu_vsyscall_init(void *arg) -{ - /* preemption should be already off */ - vsyscall_set_cpu(raw_smp_processor_id()); -} - -static int -cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) -{ - long cpu = (long)arg; - - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); - - return NOTIFY_DONE; -} - void __init map_vsyscall(void) { extern char __vsyscall_page; @@ -387,17 +344,3 @@ void __init map_vsyscall(void) BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } - -static int __init vsyscall_init(void) -{ - cpu_notifier_register_begin(); - - on_each_cpu(cpu_vsyscall_init, NULL, 1); - /* notifier priority > KVM */ - __hotcpu_notifier(cpu_vsyscall_notifier, 30); - - cpu_notifier_register_done(); - - return 0; -} -__initcall(vsyscall_init); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 970463b566cf..a155dca5edb5 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -10,12 +10,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; @@ -238,3 +240,62 @@ static __init int vdso_setup(char *s) } __setup("vdso=", vdso_setup); #endif + +#ifdef CONFIG_X86_64 +/* + * Assume __initcall executes before all user space. Hopefully kmod + * doesn't violate that. We'll find out if it does. + */ +static void vsyscall_set_cpu(int cpu) +{ + unsigned long d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded quickly + * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) + */ + d = 0x0f40000000000ULL; + d |= cpu; + d |= (node & 0xf) << 12; + d |= (node >> 4) << 48; + + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static void cpu_vsyscall_init(void *arg) +{ + /* preemption should be already off */ + vsyscall_set_cpu(raw_smp_processor_id()); +} + +static int +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + + return NOTIFY_DONE; +} + +static int __init vsyscall_init(void) +{ + cpu_notifier_register_begin(); + + on_each_cpu(cpu_vsyscall_init, NULL, 1); + /* notifier priority > KVM */ + __hotcpu_notifier(cpu_vsyscall_notifier, 30); + + cpu_notifier_register_done(); + + return 0; +} +__initcall(vsyscall_init); +#endif -- cgit v1.2.3 From 258801563ba1be05793f2417ae30557b43a24e4e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:53 -0700 Subject: x86/vdso: Change the PER_CPU segment to use struct desc_struct This makes it easier to see what's going on. It produces exactly the same segment descriptor as the old code. 
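A quick way to check that equivalence outside the kernel is to build the descriptor both ways and compare the raw 64-bit values: the old constant 0x0f40000000000ULL is just type=4, s=1, dpl=3, p=1 shifted into place, with the cpu/node limit bits OR'd in. The sketch below is a stand-alone user-space program; its struct desc_struct is a minimal stand-in for the kernel's bitfield layout (an assumption for illustration -- the real definition lives in arch/x86/include/asm/desc_defs.h), and it uses the pre-"accessed" type value 4 that this commit still has.

    /* User-space sketch: show that the designated-initializer form produces
     * the same descriptor as the old open-coded 0x0f40000000000ULL encoding.
     * The struct below is a stand-in for the kernel's desc_struct (assumed
     * layout, little-endian bitfields as on x86_64/GCC). */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct desc_struct {
            unsigned short limit0;
            unsigned short base0;
            unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
            unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
    };

    int main(void)
    {
            unsigned int cpu = 3, node = 1;     /* arbitrary example values */

            /* Old encoding: raw bits. */
            uint64_t old = 0x0f40000000000ULL;
            old |= cpu;
            old |= (uint64_t)(node & 0xf) << 12;
            old |= (uint64_t)(node >> 4) << 48;

            /* New encoding: named fields. */
            struct desc_struct d = {
                    .limit0 = cpu | ((node & 0xf) << 12),
                    .limit  = node >> 4,
                    .type   = 4,    /* RO data, expand down */
                    .dpl    = 3,    /* visible to user code */
                    .s      = 1,    /* not a system segment */
                    .p      = 1,    /* present */
            };

            uint64_t new_bits;
            memcpy(&new_bits, &d, sizeof(new_bits));

            assert(new_bits == old);
            printf("descriptor = %#llx\n", (unsigned long long)new_bits);
            return 0;
    }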
Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/d492f7b55136cbc60f016adae79160707b2e03b7.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/vdso/vma.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index a155dca5edb5..261b1349acc9 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -248,7 +248,7 @@ __setup("vdso=", vdso_setup); */ static void vsyscall_set_cpu(int cpu) { - unsigned long d; + struct desc_struct d; unsigned long node = 0; #ifdef CONFIG_NUMA node = cpu_to_node(cpu); @@ -257,13 +257,18 @@ static void vsyscall_set_cpu(int cpu) write_rdtscp_aux((node << 12) | cpu); /* - * Store cpu number in limit so that it can be loaded quickly - * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) + * Store cpu number in limit so that it can be loaded + * quickly in user space in vgetcpu. (12 bits for the CPU + * and 8 bits for the node) */ - d = 0x0f40000000000ULL; - d |= cpu; - d |= (node & 0xf) << 12; - d |= (node >> 4) << 48; + d = (struct desc_struct) { + .limit0 = cpu | ((node & 0xf) << 12), + .limit = node >> 4, + .type = 4, /* RO data, expand down */ + .dpl = 3, /* Visible to user code */ + .s = 1, /* Not a system segment */ + .p = 1, /* Present */ + }; write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } -- cgit v1.2.3 From 9c0080ef93bce34db130db6dc3bd946348e6a203 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:54 -0700 Subject: x86/vdso: Make the PER_CPU segment start out accessed The first userspace attempt to read or write the PER_CPU segment will write the accessed bit to the GDT. This is visible to userspace using the LAR instruction, and it also pointlessly dirties a cache line. Set the segment's accessed bit at boot to prevent userspace access to segments from having side effects. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/ac63814ca4c637a08ec2fd0360d67ca67560a9ee.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/vdso/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 261b1349acc9..0c7997467be0 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -264,7 +264,7 @@ static void vsyscall_set_cpu(int cpu) d = (struct desc_struct) { .limit0 = cpu | ((node & 0xf) << 12), .limit = node >> 4, - .type = 4, /* RO data, expand down */ + .type = 5, /* RO data, expand down, accessed */ .dpl = 3, /* Visible to user code */ .s = 1, /* Not a system segment */ .p = 1, /* Present */ -- cgit v1.2.3 From 287e013108a104e459e675432cf20087feab2b67 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:55 -0700 Subject: x86/vdso: Make the PER_CPU segment 32 bits IMO users ought not to be able to use 16-bit segments without using modify_ldt. Fortunately, it's impossible to break espfix64 by loading the PER_CPU segment into SS because it's PER_CPU is marked read-only and SS cannot contain an RO segment, but marking PER_CPU as 32-bit is less fragile. 
Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/179f490d659307873eefd09206bebd417e2ab5ad.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 0c7997467be0..32ca60c8157b 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -268,6 +268,7 @@ static void vsyscall_set_cpu(int cpu) .dpl = 3, /* Visible to user code */ .s = 1, /* Not a system segment */ .p = 1, /* Present */ + .d = 1, /* 32-bit */ }; write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -- cgit v1.2.3 From 61a492fb1759f3e892ad0408e36d3575c5f890d0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:56 -0700 Subject: x86_64/vdso: Remove jiffies from the vvar page I think that the jiffies vvar was once used for the vgetcpu cache. That code is long gone, so let's just make jiffies be a normal variable. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/fcfee6f8749af14d96373a9e2656354ad0b95499.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vvar.h | 1 - arch/x86/kernel/time.c | 2 +- arch/x86/vdso/vgetcpu.c | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 5d2b9ad2c6d2..5f6d40734a3b 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -44,7 +44,6 @@ extern char __vvar_page; /* DECLARE_VVAR(offset, type, name) */ -DECLARE_VVAR(0, volatile unsigned long, jiffies) DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 0fa29609b2c4..25adc0e16eaa 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include #ifdef CONFIG_X86_64 -__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; +__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 2f94b039e55b..add1d98d2477 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 1c0c1b93df4dad43b8050db005bb1c03bc7e09bf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Sep 2014 10:50:57 -0700 Subject: x86_64/vdso: Clean up vgetcpu init and merge the vdso initcalls Now vdso/vma.c has a single initcall and no references to "vsyscall". Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/945c463e2804fedd8b08d63a040cbe85d55195aa.1411494540.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/vdso/vma.c | 54 ++++++++++++++++++----------------------------------- 1 file changed, 18 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 32ca60c8157b..a280b11e2122 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -1,7 +1,8 @@ /* - * Set up the VMAs to tell the VM about the vDSO. * Copyright 2007 Andi Kleen, SUSE Labs. * Subject to the GPL, v.2 + * + * This contains most of the x86 vDSO kernel-side code. 
*/ #include #include @@ -11,18 +12,16 @@ #include #include #include -#include #include #include #include +#include #include #include #include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; - -extern unsigned short vdso_sync_cpuid; #endif void __init init_vdso_image(const struct vdso_image *image) @@ -40,20 +39,6 @@ void __init init_vdso_image(const struct vdso_image *image) image->alt_len)); } -#if defined(CONFIG_X86_64) -static int __init init_vdso(void) -{ - init_vdso_image(&vdso_image_64); - -#ifdef CONFIG_X86_X32_ABI - init_vdso_image(&vdso_image_x32); -#endif - - return 0; -} -subsys_initcall(init_vdso); -#endif - struct linux_binprm; /* Put the vdso above the (randomized) stack with another randomized offset. @@ -242,12 +227,9 @@ __setup("vdso=", vdso_setup); #endif #ifdef CONFIG_X86_64 -/* - * Assume __initcall executes before all user space. Hopefully kmod - * doesn't violate that. We'll find out if it does. - */ -static void vsyscall_set_cpu(int cpu) +static void vgetcpu_cpu_init(void *arg) { + int cpu = smp_processor_id(); struct desc_struct d; unsigned long node = 0; #ifdef CONFIG_NUMA @@ -274,34 +256,34 @@ static void vsyscall_set_cpu(int cpu) write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } -static void cpu_vsyscall_init(void *arg) -{ - /* preemption should be already off */ - vsyscall_set_cpu(raw_smp_processor_id()); -} - static int -cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); return NOTIFY_DONE; } -static int __init vsyscall_init(void) +static int __init init_vdso(void) { + init_vdso_image(&vdso_image_64); + +#ifdef CONFIG_X86_X32_ABI + init_vdso_image(&vdso_image_x32); +#endif + cpu_notifier_register_begin(); - on_each_cpu(cpu_vsyscall_init, NULL, 1); + on_each_cpu(vgetcpu_cpu_init, NULL, 1); /* notifier priority > KVM */ - __hotcpu_notifier(cpu_vsyscall_notifier, 30); + __hotcpu_notifier(vgetcpu_cpu_notifier, 30); cpu_notifier_register_done(); return 0; } -__initcall(vsyscall_init); -#endif +subsys_initcall(init_vdso); +#endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From a92f101bc99d17b75021cf29f18cc57f39a37d1f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 1 Nov 2014 21:18:26 +0100 Subject: x86: vdso: Fix build with older gcc gcc-4.4.4: arch/x86/vdso/vma.c: In function 'vgetcpu_cpu_init': arch/x86/vdso/vma.c:247: error: unknown field 'limit0' specified in initializer arch/x86/vdso/vma.c:247: warning: missing braces around initializer arch/x86/vdso/vma.c:247: warning: (near initialization for '(anonymous).') arch/x86/vdso/vma.c:248: error: unknown field 'limit' specified in initializer arch/x86/vdso/vma.c:248: warning: excess elements in struct initializer arch/x86/vdso/vma.c:248: warning: (near initialization for '(anonymous)') .... I couldn't find any way of tricking it into accepting an initializer format :( Reported-by: Boris Ostrovsky Fixes: 258801563b ("x86/vdso: Change the PER_CPU segment to use struct desc_struct") Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Thomas Gleixner --- arch/x86/vdso/vma.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index a280b11e2122..009495b9ab4b 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -230,7 +230,7 @@ __setup("vdso=", vdso_setup); static void vgetcpu_cpu_init(void *arg) { int cpu = smp_processor_id(); - struct desc_struct d; + struct desc_struct d = { }; unsigned long node = 0; #ifdef CONFIG_NUMA node = cpu_to_node(cpu); @@ -243,15 +243,13 @@ static void vgetcpu_cpu_init(void *arg) * quickly in user space in vgetcpu. (12 bits for the CPU * and 8 bits for the node) */ - d = (struct desc_struct) { - .limit0 = cpu | ((node & 0xf) << 12), - .limit = node >> 4, - .type = 5, /* RO data, expand down, accessed */ - .dpl = 3, /* Visible to user code */ - .s = 1, /* Not a system segment */ - .p = 1, /* Present */ - .d = 1, /* 32-bit */ - }; + d.limit0 = cpu | ((node & 0xf) << 12); + d.limit = node >> 4; + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } -- cgit v1.2.3 From e76b027e6408f5570dc940b731ec9ae870c6188a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Oct 2014 14:58:01 -0700 Subject: x86,vdso: Use LSL unconditionally for vgetcpu LSL is faster than RDTSCP and works everywhere; there's no need to switch between them depending on CPU. Signed-off-by: Andy Lutomirski Cc: Andi Kleen Link: http://lkml.kernel.org/r/72f73d5ec4514e02bba345b9759177ef03742efb.1414706021.git.luto@amacapital.net Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/vgtod.h | 19 +++++++++++++++++++ arch/x86/include/asm/vsyscall.h | 29 ----------------------------- arch/x86/include/asm/vvar.h | 1 - arch/x86/kernel/cpu/common.c | 10 ---------- arch/x86/kernel/vsyscall_64.c | 2 -- arch/x86/vdso/vgetcpu.c | 1 - 6 files changed, 19 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3c3366c2e37f..e7e9682a33e9 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -70,4 +70,23 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s) ++s->seq; } +#ifdef CONFIG_X86_64 + +#define VGETCPU_CPU_MASK 0xfff + +static inline unsigned int __getcpu(void) +{ + unsigned int p; + + /* + * Load per CPU data from GDT. LSL is faster than RDTSCP and + * works on all CPUs. 
+ */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + + return p; +} + +#endif /* CONFIG_X86_64 */ + #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 2a46ca720afc..34f7d8857542 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,15 +4,6 @@ #include #include -#define VGETCPU_RDTSCP 1 -#define VGETCPU_LSL 2 - -/* kernel space (writeable) */ -extern int vgetcpu_mode; -extern struct timezone sys_tz; - -#include - extern void map_vsyscall(void); /* @@ -21,24 +12,4 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); -#ifdef CONFIG_X86_64 - -#define VGETCPU_CPU_MASK 0xfff - -static inline unsigned int __getcpu(void) -{ - unsigned int p; - - if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - - return p; -} -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 5f6d40734a3b..3f32dfc2ab73 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -44,7 +44,6 @@ extern char __vvar_page; /* DECLARE_VVAR(offset, type, name) */ -DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b4f78c9ba19..175372b854be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -956,14 +956,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_64 -static void vgetcpu_set_mode(void) -{ - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; -} - #ifdef CONFIG_IA32_EMULATION /* May not be __init: called during resume */ static void syscall32_cpu_init(void) @@ -1006,8 +998,6 @@ void __init identify_boot_cpu(void) #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); -#else - vgetcpu_set_mode(); #endif cpu_detect_tlb(&boot_cpu_data); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 2f9ef0c1d112..419e83b58436 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -52,8 +52,6 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -DEFINE_VVAR(int, vgetcpu_mode); - static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; static int __init vsyscall_setup(char *str) diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index add1d98d2477..8ec3d1f4ce9a 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -8,7 +8,6 @@ #include #include #include -#include #include notrace long -- cgit v1.2.3 From 87983c66bc02c9cd8e4a42e7924435145d52bb13 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 29 Oct 2014 14:33:45 -0700 Subject: x86_64, vsyscall: Turn vsyscalls all the way off when vsyscall==none I see no point in having an unusable read-only page sitting at 0xffffffffff600000 when vsyscall=none. Instead, skip mapping it and remove it from /proc/PID/maps. I kept the ratelimited warning when programs try to use a vsyscall in this mode, since it may help admins avoid confusion. 
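One easy way to observe the effect of this change from user space is to look for the "[vsyscall]" entry (the name comes from gate_vma_name() above) in /proc/self/maps: with vsyscall=none it should no longer appear. The check below is an illustrative sketch, not part of the patch.

    /* Report whether the legacy vsyscall page is exposed to this process.
     * Illustrative sketch; the "[vsyscall]" name and the 0xffffffffff600000
     * address come from the vsyscall emulation code discussed here. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[512];
            FILE *f = fopen("/proc/self/maps", "r");

            if (!f) {
                    perror("/proc/self/maps");
                    return 1;
            }
            while (fgets(line, sizeof(line), f)) {
                    if (strstr(line, "[vsyscall]")) {
                            fputs(line, stdout); /* e.g. ffffffffff600000-ffffffffff601000 ... */
                            fclose(f);
                            return 0;
                    }
            }
            fclose(f);
            puts("no [vsyscall] mapping");
            return 0;
    }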
Signed-off-by: Andy Lutomirski Reviewed-by: Josh Triplett Cc: Konrad Rzeszutek Wilk Link: http://lkml.kernel.org/r/0dddbadc1d4e3bfbaf887938ff42afc97a7cc1f2.1414618407.git.luto@amacapital.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 419e83b58436..2d912629c96e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -307,6 +307,8 @@ struct vm_area_struct *get_gate_vma(struct mm_struct *mm) if (!mm || mm->context.ia32_compat) return NULL; #endif + if (vsyscall_mode == NONE) + return NULL; return &gate_vma; } @@ -327,7 +329,7 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) */ int in_gate_area_no_mm(unsigned long addr) { - return (addr & PAGE_MASK) == VSYSCALL_ADDR; + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } void __init map_vsyscall(void) @@ -335,10 +337,12 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } -- cgit v1.2.3 From 95c46b56922409ed8838b3b420b11cfebb8c6c88 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 29 Oct 2014 14:33:46 -0700 Subject: x86_64, vsyscall: Rewrite comment and clean up headers in vsyscall code vsyscall_64.c is just vsyscall emulation. Tidy it up accordingly. [ tglx: Preserved the original copyright notices ] Signed-off-by: Andy Lutomirski Reviewed-by: Josh Triplett Cc: Konrad Rzeszutek Wilk Link: http://lkml.kernel.org/r/9c448d5643d0fdb618f8cde9a54c21d2bcd486ce.1414618407.git.luto@amacapital.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 50 ++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 2d912629c96e..7d9eb4bc10ac 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -1,52 +1,38 @@ /* + * Copyright (c) 2012-2014 Andy Lutomirski + * + * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * - * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. * - * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located - * at virtual address -10Mbyte+1024bytes etc... 
There are at max 4 - * vsyscalls. One vsyscall can reserve more than 1 slot to avoid - * jumping out of line if necessary. We cannot add more with this - * mechanism because older kernels won't return -ENOSYS. + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. * - * Note: the concept clashes with user mode linux. UML users should - * use the vDSO. + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include #include #define CREATE_TRACE_POINTS -- cgit v1.2.3 From 1ad83c858c7d4ea210429142c99a1548e6715a35 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 29 Oct 2014 14:33:47 -0700 Subject: x86_64,vsyscall: Make vsyscall emulation configurable This adds CONFIG_X86_VSYSCALL_EMULATION, guarded by CONFIG_EXPERT. Turning it off completely disables vsyscall emulation, saving ~3.5k for vsyscall_64.c, 4k for vsyscall_emu_64.S (the fake vsyscall page), some tiny amount of core mm code that supports a gate area, and possibly 4k for a wasted pagetable. The latter is because the vsyscall addresses are misaligned and fit poorly in the fixmap. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Triplett Cc: Konrad Rzeszutek Wilk Link: http://lkml.kernel.org/r/406db88b8dd5f0cbbf38216d11be34bbb43c7eae.1414618407.git.luto@amacapital.net Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 18 ++++++++++++++++++ arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/include/asm/page_64.h | 4 +++- arch/x86/include/asm/vsyscall.h | 8 ++++++++ arch/x86/kernel/Makefile | 3 +-- arch/x86/kernel/setup.c | 2 -- arch/x86/xen/mmu.c | 6 ++++-- 7 files changed, 36 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2327e88e07c..cd10436d7d1c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -984,6 +984,24 @@ config X86_ESPFIX64 def_bool y depends on X86_16BIT && X86_64 +config X86_VSYSCALL_EMULATION + bool "Enable vsyscall emulation" if EXPERT + default y + depends on X86_64 + ---help--- + This enables emulation of the legacy vsyscall page. Disabling + it is roughly equivalent to booting with vsyscall=none, except + that it will also disable the helpful warning if a program + tries to use a vsyscall. With this option set to N, offending + programs will just segfault, citing addresses of the form + 0xffffffffff600?00. + + This option is required by many programs built before 2013, and + care should be used even with newer programs if set to N. + + Disabling this option saves about 7K of kernel size and + possibly 4K of additional runtime pagetable memory. 
+ config TOSHIBA tristate "Toshiba Laptop support" depends on X86_32 diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index ffb1733ac91f..d8d5bcb2a0b5 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -69,7 +69,9 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, #else +#ifdef CONFIG_X86_VSYSCALL_EMULATION VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, +#endif #ifdef CONFIG_PARAVIRT_CLOCK PVCLOCK_FIXMAP_BEGIN, PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index f408caf73430..b3bebf9e5746 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,6 +39,8 @@ void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ -#define __HAVE_ARCH_GATE_AREA 1 +#ifdef CONFIG_X86_VSYSCALL_EMULATION +# define __HAVE_ARCH_GATE_AREA 1 +#endif #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 34f7d8857542..6ba66ee79710 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,6 +4,7 @@ #include #include +#ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); /* @@ -11,5 +12,12 @@ extern void map_vsyscall(void); * Returns true if handled. */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +#else +static inline void map_vsyscall(void) {} +static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + return false; +} +#endif #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f1e77440b2b..5d4502c8b983 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,8 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_X86_64) += vsyscall_64.o -obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 235cfd39e0d7..59a6f884fdad 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1190,9 +1190,7 @@ void __init setup_arch(char **cmdline_p) tboot_probe(); -#ifdef CONFIG_X86_64 map_vsyscall(); -#endif generic_apic_probe(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a8a1a3d08d4d..8906cf0e536f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1457,8 +1457,10 @@ static int xen_pgd_alloc(struct mm_struct *mm) page->private = (unsigned long)user_pgd; if (user_pgd != NULL) { +#ifdef CONFIG_X86_VSYSCALL_EMULATION user_pgd[pgd_index(VSYSCALL_ADDR)] = __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); +#endif ret = 0; } @@ -2021,7 +2023,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # ifdef CONFIG_HIGHMEM case FIX_KMAP_BEGIN ... FIX_KMAP_END: # endif -#else +#elif defined(CONFIG_X86_VSYSCALL_EMULATION) case VSYSCALL_PAGE: #endif case FIX_TEXT_POKE0: @@ -2060,7 +2062,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) __native_set_fixmap(idx, pte); -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_VSYSCALL_EMULATION /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. 
*/ if (idx == VSYSCALL_PAGE) { -- cgit v1.2.3 From 07114f0f1cda8b2ef6e884d0c7b268a32cce7903 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 4 Nov 2014 15:46:21 -0800 Subject: x86_64: Add a comment explaining the TASK_SIZE_MAX guard page That guard page is absolutely necessary; explain why for posterity. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/23320cb5017c2da8475ec20fcde8089d82aa2699.1415144745.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eb71ec794732..82d93ea13c0c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -893,7 +893,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #else /* - * User space process size. 47bits minus one guard page. + * User space process size. 47bits minus one guard page. The guard + * page is necessary on Intel CPUs: if a SYSCALL instruction is at + * the highest possible canonical userspace address, then that + * syscall will enter the kernel with a non-canonical return + * address, and SYSRET will explode dangerously. We avoid this + * particular problem by preventing anything from being mapped + * at the maximum canonical address. */ #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) -- cgit v1.2.3 From 26893107aa717cd11010f0c278d02535defa1ac9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 4 Nov 2014 15:36:50 -0800 Subject: x86_64/vsyscall: Restore orig_ax after vsyscall seccomp The vsyscall emulation code sets orig_ax for seccomp's benefit, but it forgot to set it back. I'm not sure that this is observable at all, but it could cause confusion to various /proc or ptrace users, and it's possible that it could cause minor artifacts if a signal were to be delivered on return from vsyscall emulation. Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/r/cdc6a564517a4df09235572ee5f530ccdcf933f7.1415144089.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7d9eb4bc10ac..2dcc6ff6fdcc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -206,6 +206,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) "seccomp tried to change syscall nr or ip"); do_exit(SIGSYS); } + regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ -- cgit v1.2.3
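For reference, the LSL-based __getcpu() path that this series converges on can be exercised from plain user space: the per-CPU segment's limit carries the CPU number in its low 12 bits (VGETCPU_CPU_MASK) and the node number in the bits above, exactly as vgetcpu_cpu_init() encodes it. In the sketch below the selector value 0x7b (GDT_ENTRY_PER_CPU * 8 | 3) is an assumption made for illustration -- it is not spelled out in these patches -- and the inline asm simply mirrors the kernel's __getcpu() helper.

    /* User-space sketch of the LSL-based vgetcpu decode.  Assumes an x86_64
     * Linux kernel that sets up the per-CPU GDT segment as in this series,
     * and assumes selector 0x7b for that segment (hypothetical constant,
     * chosen for illustration). */
    #include <stdio.h>

    #define VGETCPU_CPU_MASK 0xfff          /* 12 bits for the CPU */

    static unsigned int getcpu_limit(void)
    {
            unsigned int p;

            /* LSL loads the segment limit; it is unprivileged and, unlike
             * RDTSCP, works on every CPU. */
            asm("lsl %1,%0" : "=r" (p) : "r" (0x7bU));
            return p;
    }

    int main(void)
    {
            unsigned int p = getcpu_limit();

            printf("cpu=%u node=%u\n", p & VGETCPU_CPU_MASK, p >> 12);
            return 0;
    }

Compiled with gcc and run on a multi-node machine, the reported node follows the CPU the task happens to run on; pinning the task with taskset is a simple way to confirm the decode.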