From 4674fdb9f1493f12a15c5f2158f260060b58953e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 14:49:01 +0100 Subject: arm64: mm: dump: make page table dumping reusable For debugging purposes, it would be nice if we could export page tables other than the swapper_pg_dir to userspace. To enable this, this patch refactors the arm64 page table dumping code such that multiple tables may be registered with the framework, and exported under debugfs. Signed-off-by: Mark Rutland Signed-off-by: Ard Biesheuvel Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/dump.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index ccfde237d6e6..f94b80eb295d 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -27,11 +27,7 @@ #include #include #include - -struct addr_marker { - unsigned long start_address; - const char *name; -}; +#include static const struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN @@ -290,7 +286,8 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) } } -static void walk_pgd(struct pg_state *st, struct mm_struct *mm, unsigned long start) +static void walk_pgd(struct pg_state *st, struct mm_struct *mm, + unsigned long start) { pgd_t *pgd = pgd_offset(mm, 0UL); unsigned i; @@ -309,12 +306,13 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, unsigned long st static int ptdump_show(struct seq_file *m, void *v) { + struct ptdump_info *info = m->private; struct pg_state st = { .seq = m, - .marker = address_markers, + .marker = info->markers, }; - walk_pgd(&st, &init_mm, VA_START); + walk_pgd(&st, info->mm, info->base_addr); note_page(&st, 0, 0, 0); return 0; @@ -322,7 +320,7 @@ static int ptdump_show(struct seq_file *m, void *v) static int ptdump_open(struct inode *inode, struct file *file) { - return single_open(file, ptdump_show, NULL); + return single_open(file, ptdump_show, inode->i_private); } static const struct file_operations ptdump_fops = { @@ -332,7 +330,7 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -static int ptdump_init(void) +int ptdump_register(struct ptdump_info *info, const char *name) { struct dentry *pe; unsigned i, j; @@ -342,8 +340,18 @@ static int ptdump_init(void) for (j = 0; j < pg_level[i].num; j++) pg_level[i].mask |= pg_level[i].bits[j].mask; - pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, - &ptdump_fops); + pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); return pe ? 0 : -ENOMEM; } + +static struct ptdump_info kernel_ptdump_info = { + .mm = &init_mm, + .markers = address_markers, + .base_addr = VA_START, +}; + +static int ptdump_init(void) +{ + return ptdump_register(&kernel_ptdump_info, "kernel_page_tables"); +} device_initcall(ptdump_init); -- cgit v1.2.3 From b67a8b29df7e6410c605c2759707c96512b15578 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 8 Jun 2016 15:53:46 +0800 Subject: arm64: mm: only initialize swiotlb when necessary we only initialize swiotlb when swiotlb_force is true or not all system memory is DMA-able, this trivial optimization saves us 64MB when swiotlb is not necessary. Signed-off-by: Jisheng Zhang Signed-off-by: Catalin Marinas --- arch/arm64/mm/dma-mapping.c | 15 ++++++++++++++- arch/arm64/mm/init.c | 3 ++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index c566ec83719f..46a4157adc17 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include +static int swiotlb __read_mostly; + static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, bool coherent) { @@ -341,6 +344,13 @@ static int __swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } +static int __swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + if (swiotlb) + return swiotlb_dma_supported(hwdev, mask); + return 1; +} + static struct dma_map_ops swiotlb_dma_ops = { .alloc = __dma_alloc, .free = __dma_free, @@ -354,7 +364,7 @@ static struct dma_map_ops swiotlb_dma_ops = { .sync_single_for_device = __swiotlb_sync_single_for_device, .sync_sg_for_cpu = __swiotlb_sync_sg_for_cpu, .sync_sg_for_device = __swiotlb_sync_sg_for_device, - .dma_supported = swiotlb_dma_supported, + .dma_supported = __swiotlb_dma_supported, .mapping_error = swiotlb_dma_mapping_error, }; @@ -513,6 +523,9 @@ EXPORT_SYMBOL(dummy_dma_ops); static int __init arm64_dma_init(void) { + if (swiotlb_force || max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT)) + swiotlb = 1; + return atomic_pool_init(); } arch_initcall(arm64_dma_init); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d45f8627012c..7d25b4d00677 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -403,7 +403,8 @@ static void __init free_unused_memmap(void) */ void __init mem_init(void) { - swiotlb_init(1); + if (swiotlb_force || max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT)) + swiotlb_init(1); set_max_mapnr(pfn_to_page(max_pfn) - mem_map); -- cgit v1.2.3 From 275f344bec51e9100bae81f3cc8c6940bbfb24c0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:01 +0100 Subject: arm64: add macro to extract ESR_ELx.EC Several places open-code extraction of the EC field from an ESR_ELx value, in subtly different ways. This is unfortunate duplication and variation, and the precise logic used to extract the field is a distraction. This patch adds a new macro, ESR_ELx_EC(), to extract the EC field from an ESR_ELx value in a consistent fashion. Existing open-coded extractions in core arm64 code are moved over to the new helper. KVM code is left as-is for the moment. Signed-off-by: Mark Rutland Tested-by: Huang Shijie Cc: Dave P Martin Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/esr.h | 1 + arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 77eeb2cc648f..f772e15c4766 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -74,6 +74,7 @@ #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define ESR_ELx_IL (UL(1) << 25) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index d9da2c56e4f8..a4250a59f2b9 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -456,7 +456,7 @@ static const char *esr_class_str[] = { const char *esr_get_class_string(u32 esr) { - return esr_class_str[esr >> ESR_ELx_EC_SHIFT]; + return esr_class_str[ESR_ELx_EC(esr)]; } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 013e2cbe7924..d2c124cbd18c 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -244,7 +244,7 @@ out: static inline int permission_fault(unsigned int esr) { - unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); -- cgit v1.2.3 From 541ec870ef31433018d245614254bd9d810a9ac3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:03 +0100 Subject: arm64: kill ESR_LNX_EXEC Currently we treat ESR_EL1 bit 24 as software-defined for distinguishing instruction aborts from data aborts, but this bit is architecturally RES0 for instruction aborts, and could be allocated for an arbitrary purpose in future. Additionally, we hard-code the value in entry.S without the mnemonic, making the code difficult to understand. Instead, remove ESR_LNX_EXEC, and distinguish aborts based on the esr, which we already pass to the sole use of ESR_LNX_EXEC. A new helper, is_el0_instruction_abort() is added to make the logic clear. Any instruction aborts taken from EL1 will already have been handled by bad_mode, so we need not handle that case in the helper. For consistency, the existing permission_fault helper is renamed to is_permission_fault, and the return type is changed to bool. There should be no functional changes as the return value was a boolean expression, and the result is only used in another boolean expression. Signed-off-by: Mark Rutland Cc: Dave P Martin Cc: Huang Shijie Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/fault.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 12e8d2bcb3f9..eefffa81c6df 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -532,7 +532,7 @@ el0_ia: enable_dbg_and_irq ct_user_exit mov x0, x26 - orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts + mov x1, x25 mov x2, sp bl do_mem_abort b ret_to_user diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d2c124cbd18c..fc5a34a72c6d 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -202,8 +202,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADMAP 0x010000 #define VM_FAULT_BADACCESS 0x020000 -#define ESR_LNX_EXEC (1 << 24) - static int __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct task_struct *tsk) @@ -242,7 +240,7 @@ out: return fault; } -static inline int permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; @@ -250,6 +248,11 @@ static inline int permission_fault(unsigned int esr) return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } +static bool is_el0_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -272,14 +275,14 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; - if (esr & ESR_LNX_EXEC) { + if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } - if (permission_fault(esr) && (addr < USER_DS)) { + if (is_permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); -- cgit v1.2.3 From ea2cbee3bc671390139802dd0d50b08db024b03c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 22 Jun 2016 12:13:45 +0100 Subject: arm64: mm: simplify memblock numa node extraction We currently open-code extracting the NUMA node of a memblock region, which requires an ifdef to cater for !CONFIG_NUMA builds where the memblock_region::nid field does not exist. The generic memblock_get_region_node helper is intended to cater for this. For CONFIG_HAVE_MEMBLOCK_NODE_MAP, builds this returns reg->nid, and for for !CONFIG_HAVE_MEMBLOCK_NODE_MAP builds this is a static inline that returns 0. Note that for arm64, CONFIG_HAVE_MEMBLOCK_NODE_MAP is selected iff CONFIG_NUMA is. This patch makes use of memblock_get_region_node to simplify the arm64 code. At the same time, we can move the nid variable definition into the loop, as this is the only place it is used. Signed-off-by: Mark Rutland Reviewed-by: Steve Capper Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7d25b4d00677..64ea28306661 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -160,12 +160,10 @@ static void __init arm64_memory_present(void) static void __init arm64_memory_present(void) { struct memblock_region *reg; - int nid = 0; for_each_memblock(memory, reg) { -#ifdef CONFIG_NUMA - nid = reg->nid; -#endif + int nid = memblock_get_region_node(reg); + memory_present(nid, memblock_region_memory_base_pfn(reg), memblock_region_memory_end_pfn(reg)); } -- cgit v1.2.3 From 9fdc14c55cd6579d619ccd9d40982e0805e62b6d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 23 Jun 2016 15:53:17 +0200 Subject: arm64: mm: fix location of _etext As Kees Cook notes in the ARM counterpart of this patch [0]: The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. In particular, Kees is referring to the HARDENED_USERCOPY patch set [1], which rejects attempts to call copy_to_user() on kernel ranges containing executable code, but does allow access to the .rodata segment. Regardless of whether one may or may not agree with the distinction, it makes sense for _etext to have the same meaning across architectures. So let's put _etext where it belongs, between .text and .rodata, and fix up existing references to use __init_begin instead, which unlike _end_rodata includes the exception and notes sections as well. The _etext references in kaslr.c are left untouched, since its references to [_stext, _etext) are meant to capture potential jump instruction targets, and so disregarding .rodata is actually an improvement here. [0] http://article.gmane.org/gmane.linux.kernel/2245084 [1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502 Reported-by: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 7 ++++--- arch/arm64/mm/init.c | 4 ++-- arch/arm64/mm/mmu.c | 20 ++++++++++---------- 4 files changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 3279defabaa2..c1509e654190 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -202,7 +202,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 435e820e898d..0de7be4f1a9d 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -131,12 +131,13 @@ SECTIONS } . = ALIGN(SEGMENT_ALIGN); - RO_DATA(PAGE_SIZE) /* everything from this point to */ - EXCEPTION_TABLE(8) /* _etext will be marked RO NX */ + _etext = .; /* End of text section */ + + RO_DATA(PAGE_SIZE) /* everything from this point to */ + EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ NOTES . = ALIGN(SEGMENT_ALIGN); - _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 64ea28306661..2ade7a6a10a7 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -429,9 +429,9 @@ void __init mem_init(void) pr_cont(" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n", MLG(VMALLOC_START, VMALLOC_END)); pr_cont(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(_text, __start_rodata)); + MLK_ROUNDUP(_text, _etext)); pr_cont(" .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(__start_rodata, _etext)); + MLK_ROUNDUP(__start_rodata, __init_begin)); pr_cont(" .init : 0x%p" " - 0x%p" " (%6ld KB)\n", MLK_ROUNDUP(__init_begin, __init_end)); pr_cont(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n", diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0f85a46c3e18..c356f7b84d4d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -386,14 +386,14 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { unsigned long kernel_start = __pa(_text); - unsigned long kernel_end = __pa(_etext); + unsigned long kernel_end = __pa(__init_begin); /* * Take care not to create a writable alias for the * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel text */ + /* No overlap with the kernel text/rodata */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -402,7 +402,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel text mapping. + * This block overlaps the kernel text/rodata mappings. * Map the portion(s) which don't overlap. */ if (start < kernel_start) @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_text, _etext) interval as + * Map the linear alias of the [_text, __init_begin) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. @@ -449,14 +449,14 @@ void mark_rodata_ro(void) { unsigned long section_size; - section_size = (unsigned long)__start_rodata - (unsigned long)_text; + section_size = (unsigned long)_etext - (unsigned long)_text; create_mapping_late(__pa(_text), (unsigned long)_text, section_size, PAGE_KERNEL_ROX); /* - * mark .rodata as read only. Use _etext rather than __end_rodata to - * cover NOTES and EXCEPTION_TABLE. + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. */ - section_size = (unsigned long)_etext - (unsigned long)__start_rodata; + section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); } @@ -499,8 +499,8 @@ static void __init map_kernel(pgd_t *pgd) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; - map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); - map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); + map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); -- cgit v1.2.3 From 6c5269f33e96722f3e20dc694fba636cc1d4b3eb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 30 Jun 2016 10:02:04 +0800 Subject: arm64: mm: remove unnecessary BUG_ON The memblock_alloc() and memblock_alloc_base() will panic on their own if no free memory, remove pointless BUG_ON. Signed-off-by: Kefeng Wang Acked-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c356f7b84d4d..0c95d6ec873d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -77,7 +77,6 @@ static phys_addr_t __init early_pgtable_alloc(void) void *ptr; phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - BUG_ON(!phys); /* * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE -- cgit v1.2.3 From 290622efc76ece22ef76a30bf117755891ab27f6 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 28 Jun 2016 18:07:28 +0100 Subject: arm64: fix "dc cvau" cache operation on errata-affected core The ARM errata 819472, 826319, 827319 and 824069 for affected Cortex-A53 cores demand to promote "dc cvau" instructions to "dc civac" as well. Attribute the usage of the instruction in __flush_cache_user_range to also be covered by our alternative patching efforts. For that we introduce an assembly macro which both deals with alternatives while still tagging the instructions as USER. Signed-off-by: Andre Przywara Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/alternative.h | 4 ++++ arch/arm64/mm/cache.S | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 7288071195e1..8746ff6abd77 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -133,6 +133,10 @@ void apply_alternatives(void *start, size_t length); #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) +.macro user_alt, label, oldinstr, newinstr, cond +9999: alternative_insn "\oldinstr", "\newinstr", \cond + _ASM_EXTABLE 9999b, \label +.endm /* * Generate the assembly for UAO alternatives with exception table entries. diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 50ff9ba3a236..07d7352d7c38 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -52,7 +52,7 @@ ENTRY(__flush_cache_user_range) sub x3, x2, #1 bic x4, x0, x3 1: -USER(9f, dc cvau, x4 ) // clean D line to PoU +user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE add x4, x4, x2 cmp x4, x1 b.lo 1b -- cgit v1.2.3 From 53e1b32910a3bc94d9f122321442b79b314219f8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Jun 2016 14:51:26 +0200 Subject: arm64: mm: add param to force create_pgd_mapping() to use page mappings Add a bool parameter 'allow_block_mappings' to create_pgd_mapping() and the various helper functions that it descends into, to give the caller control over whether block entries may be used to create the mapping. The UEFI runtime mapping routines will use this to avoid creating block entries that would need to split up into page entries when applying the permissions listed in the Memory Attributes firmware table. This also replaces the block_mappings_allowed() helper function that was added for DEBUG_PAGEALLOC functionality, but the resulting code is functionally equivalent (given that debug_page_alloc does not operate on EFI page table entries anyway) Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu.h | 2 +- arch/arm64/kernel/efi.c | 2 +- arch/arm64/mm/mmu.c | 67 ++++++++++++++++++-------------------------- 3 files changed, 29 insertions(+), 42 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 97b1d8f26b9c..8d9fce037b2f 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -34,7 +34,7 @@ extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot); + pgprot_t prot, bool allow_block_mappings); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); #endif diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 78f52488f9ff..981604948521 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -65,7 +65,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, - __pgprot(prot_val | PTE_NG)); + __pgprot(prot_val | PTE_NG), true); return 0; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0c95d6ec873d..a289d66121b6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -155,29 +155,10 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) } while (pmd++, i++, i < PTRS_PER_PMD); } -#ifdef CONFIG_DEBUG_PAGEALLOC -static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void)) -{ - - /* - * If debug_page_alloc is enabled we must map the linear map - * using pages. However, other mappings created by - * create_mapping_noalloc must use sections in some cases. Allow - * sections to be used in those cases, where no pgtable_alloc - * function is provided. - */ - return !pgtable_alloc || !debug_pagealloc_enabled(); -} -#else -static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void)) -{ - return true; -} -#endif - static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void), + bool allow_block_mappings) { pmd_t *pmd; unsigned long next; @@ -208,7 +189,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && - block_mappings_allowed(pgtable_alloc)) { + allow_block_mappings) { pmd_t old_pmd =*pmd; pmd_set_huge(pmd, phys, prot); /* @@ -247,7 +228,8 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void), + bool allow_block_mappings) { pud_t *pud; unsigned long next; @@ -267,8 +249,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys) && - block_mappings_allowed(pgtable_alloc)) { + if (use_1G_block(addr, next, phys) && allow_block_mappings) { pud_t old_pud = *pud; pud_set_huge(pud, phys, prot); @@ -289,7 +270,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, } } else { alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc); + pgtable_alloc, allow_block_mappings); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -303,7 +284,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, */ static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void), + bool allow_block_mappings) { unsigned long addr, length, end, next; @@ -321,7 +303,8 @@ static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc); + alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, + allow_block_mappings); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -339,9 +322,11 @@ static phys_addr_t late_pgtable_alloc(void) static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*alloc)(void)) + phys_addr_t (*alloc)(void), + bool allow_block_mappings) { - init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); + init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc, + allow_block_mappings); } /* @@ -357,16 +342,15 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - NULL); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, true); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot) + pgprot_t prot, bool allow_block_mappings) { __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - late_pgtable_alloc); + late_pgtable_alloc, allow_block_mappings); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -379,7 +363,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - late_pgtable_alloc); + late_pgtable_alloc, !debug_pagealloc_enabled()); } static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) @@ -396,7 +380,8 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, - early_pgtable_alloc); + early_pgtable_alloc, + !debug_pagealloc_enabled()); return; } @@ -408,12 +393,14 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __create_pgd_mapping(pgd, start, __phys_to_virt(start), kernel_start - start, PAGE_KERNEL, - early_pgtable_alloc); + early_pgtable_alloc, + !debug_pagealloc_enabled()); if (kernel_end < end) __create_pgd_mapping(pgd, kernel_end, __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, - early_pgtable_alloc); + early_pgtable_alloc, + !debug_pagealloc_enabled()); /* * Map the linear alias of the [_text, __init_begin) interval as @@ -423,7 +410,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL_RO, - early_pgtable_alloc); + early_pgtable_alloc, !debug_pagealloc_enabled()); } static void __init map_mem(pgd_t *pgd) @@ -480,7 +467,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc); + early_pgtable_alloc, !debug_pagealloc_enabled()); vma->addr = va_start; vma->phys_addr = pa_start; -- cgit v1.2.3 From 4133af6c0419b5a2a4da245ff7af7ceca7fd740d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 29 Jun 2016 14:51:29 +0200 Subject: arm64: mm: Remove split_p*d() functions Since the efi_create_mapping() no longer generates block mappings and being the last user of the split_p*d code, remove these functions and the corresponding TLBI. Signed-off-by: Catalin Marinas [ardb: replace 'overlapping regions' with 'block mappings' in commit log] Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 47 ++++------------------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a289d66121b6..9d2d7cfb95b4 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -96,24 +96,6 @@ static phys_addr_t __init early_pgtable_alloc(void) return phys; } -/* - * remap a PMD into pages - */ -static void split_pmd(pmd_t *pmd, pte_t *pte) -{ - unsigned long pfn = pmd_pfn(*pmd); - int i = 0; - - do { - /* - * Need to have the least restrictive permissions available - * permissions will be fixed up later - */ - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - pfn++; - } while (pte++, i++, i < PTRS_PER_PTE); -} - static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, @@ -121,15 +103,13 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, { pte_t *pte; - if (pmd_none(*pmd) || pmd_sect(*pmd)) { + BUG_ON(pmd_sect(*pmd)); + if (pmd_none(*pmd)) { phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(); pte = pte_set_fixmap(pte_phys); - if (pmd_sect(*pmd)) - split_pmd(pmd, pte); __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); - flush_tlb_all(); pte_clear_fixmap(); } BUG_ON(pmd_bad(*pmd)); @@ -143,18 +123,6 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, pte_clear_fixmap(); } -static void split_pud(pud_t *old_pud, pmd_t *pmd) -{ - unsigned long addr = pud_pfn(*old_pud) << PAGE_SHIFT; - pgprot_t prot = __pgprot(pud_val(*old_pud) ^ addr); - int i = 0; - - do { - set_pmd(pmd, __pmd(addr | pgprot_val(prot))); - addr += PMD_SIZE; - } while (pmd++, i++, i < PTRS_PER_PMD); -} - static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), @@ -166,20 +134,13 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, /* * Check for initial section mappings in the pgd/pud and remove them. */ - if (pud_none(*pud) || pud_sect(*pud)) { + BUG_ON(pud_sect(*pud)); + if (pud_none(*pud)) { phys_addr_t pmd_phys; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(); pmd = pmd_set_fixmap(pmd_phys); - if (pud_sect(*pud)) { - /* - * need to have the 1G of mappings continue to be - * present - */ - split_pud(pud, pmd); - } __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); - flush_tlb_all(); pmd_clear_fixmap(); } BUG_ON(pud_bad(*pud)); -- cgit v1.2.3 From 40f87d3114b8a1e730ac8f3dc3cf1efe33124776 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Jun 2016 14:51:30 +0200 Subject: arm64: mm: fold init_pgd() into __create_pgd_mapping() The routine __create_pgd_mapping() does nothing except calling init_pgd(), which has no other callers. So fold the latter into the former. Also, drop a comment that has gone stale. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9d2d7cfb95b4..a96a2413fa18 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -239,16 +239,14 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, pud_clear_fixmap(); } -/* - * Create the page directory entries and any necessary page tables for the - * mapping specified by 'md'. - */ -static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, - phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), + bool allow_block_mappings) { unsigned long addr, length, end, next; + pgd_t *pgd = pgd_offset_raw(pgdir, virt); /* * If the virtual and physical address don't have the same offset @@ -280,16 +278,6 @@ static phys_addr_t late_pgtable_alloc(void) return __pa(ptr); } -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, - unsigned long virt, phys_addr_t size, - pgprot_t prot, - phys_addr_t (*alloc)(void), - bool allow_block_mappings) -{ - init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc, - allow_block_mappings); -} - /* * This function can only be used to modify existing table entries, * without allocating new levels of table. Note that this permits the -- cgit v1.2.3 From 16c11325cc44f0614a45e584d439e195059c3f5a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 1 Jul 2016 17:50:10 +0100 Subject: arm64: mm: change IOMMU notifier action to attach DMA ops Current bus notifier in ARM64 (__iommu_attach_notifier) attempts to attach dma_ops to a device on BUS_NOTIFY_ADD_DEVICE action notification. This will cause issues on ACPI based systems, where PCI devices can be added before the IOMMUs the devices are attached to had a chance to be probed, causing failures on attempts to attach dma_ops in that the domain for the respective IOMMU may not be set-up yet by the time the bus notifier is run. Devices dma_ops do not require to be set-up till the matching device drivers are probed. This means that instead of running the notifier attaching dma_ops to devices (__iommu_attach_notifier) on BUS_NOTIFY_ADD_DEVICE action, it can be run just before the device driver is bound to the device in question (on action BUS_NOTIFY_BIND_DRIVER) so that it is certain that its IOMMU group and domain are set-up accordingly at the time the notifier is triggered. This patch changes the notifier action upon which dma_ops are attached to devices and defer it to driver binding time, so that IOMMU devices have a chance to be probed and to register their bus notifiers before the dma_ops attach sequence for a device is actually carried out. As a result we also no longer need worry about racing with iommu_bus_notifier(), or about retrying the queue in case devices were added too early on DT-based systems, so clean up the notifier itself plus the additional workaround from 722ec35f7fae ("arm64: dma-mapping: fix handling of devices registered before arch_initcall") Acked-by: Will Deacon Cc: Marek Szyprowski Signed-off-by: Lorenzo Pieralisi [rm: get rid of other now-redundant bits] Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas --- arch/arm64/mm/dma-mapping.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 46a4157adc17..f6c55afab3e2 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -861,15 +861,16 @@ static int __iommu_attach_notifier(struct notifier_block *nb, { struct iommu_dma_notifier_data *master, *tmp; - if (action != BUS_NOTIFY_ADD_DEVICE) + if (action != BUS_NOTIFY_BIND_DRIVER) return 0; mutex_lock(&iommu_dma_notifier_lock); list_for_each_entry_safe(master, tmp, &iommu_dma_masters, list) { - if (do_iommu_attach(master->dev, master->ops, - master->dma_base, master->size)) { + if (data == master->dev && do_iommu_attach(master->dev, + master->ops, master->dma_base, master->size)) { list_del(&master->list); kfree(master); + break; } } mutex_unlock(&iommu_dma_notifier_lock); @@ -883,17 +884,8 @@ static int __init register_iommu_dma_ops_notifier(struct bus_type *bus) if (!nb) return -ENOMEM; - /* - * The device must be attached to a domain before the driver probe - * routine gets a chance to start allocating DMA buffers. However, - * the IOMMU driver also needs a chance to configure the iommu_group - * via its add_device callback first, so we need to make the attach - * happen between those two points. Since the IOMMU core uses a bus - * notifier with default priority for add_device, do the same but - * with a lower priority to ensure the appropriate ordering. - */ + nb->notifier_call = __iommu_attach_notifier; - nb->priority = -100; ret = bus_register_notifier(bus, nb); if (ret) { @@ -917,10 +909,6 @@ static int __init __iommu_dma_init(void) if (!ret) ret = register_iommu_dma_ops_notifier(&pci_bus_type); #endif - - /* handle devices queued before this arch_initcall */ - if (!ret) - __iommu_attach_notifier(NULL, BUS_NOTIFY_ADD_DEVICE, NULL); return ret; } arch_initcall(__iommu_dma_init); -- cgit v1.2.3 From 2dd0e8d2d2a157dbc83295a78336c2217110f2f8 Mon Sep 17 00:00:00 2001 From: Sandeepa Prabhu Date: Fri, 8 Jul 2016 12:35:48 -0400 Subject: arm64: Kprobes with single stepping support Add support for basic kernel probes(kprobes) and jump probes (jprobes) for ARM64. Kprobes utilizes software breakpoint and single step debug exceptions supported on ARM v8. A software breakpoint is placed at the probe address to trap the kernel execution into the kprobe handler. ARM v8 supports enabling single stepping before the break exception return (ERET), with next PC in exception return address (ELR_EL1). The kprobe handler prepares an executable memory slot for out-of-line execution with a copy of the original instruction being probed, and enables single stepping. The PC is set to the out-of-line slot address before the ERET. With this scheme, the instruction is executed with the exact same register context except for the PC (and DAIF) registers. Debug mask (PSTATE.D) is enabled only when single stepping a recursive kprobe, e.g.: during kprobes reenter so that probed instruction can be single stepped within the kprobe handler -exception- context. The recursion depth of kprobe is always 2, i.e. upon probe re-entry, any further re-entry is prevented by not calling handlers and the case counted as a missed kprobe). Single stepping from the x-o-l slot has a drawback for PC-relative accesses like branching and symbolic literals access as the offset from the new PC (slot address) may not be ensured to fit in the immediate value of the opcode. Such instructions need simulation, so reject probing them. Instructions generating exceptions or cpu mode change are rejected for probing. Exclusive load/store instructions are rejected too. Additionally, the code is checked to see if it is inside an exclusive load/store sequence (code from Pratyush). System instructions are mostly enabled for stepping, except MSR/MRS accesses to "DAIF" flags in PSTATE, which are not safe for probing. This also changes arch/arm64/include/asm/ptrace.h to use include/asm-generic/ptrace.h. Thanks to Steve Capper and Pratyush Anand for several suggested Changes. Signed-off-by: Sandeepa Prabhu Signed-off-by: David A. Long Signed-off-by: Pratyush Anand Acked-by: Masami Hiramatsu Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/debug-monitors.h | 5 + arch/arm64/include/asm/insn.h | 2 + arch/arm64/include/asm/kprobes.h | 60 ++++ arch/arm64/include/asm/probes.h | 34 +++ arch/arm64/include/asm/ptrace.h | 14 +- arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/debug-monitors.c | 16 +- arch/arm64/kernel/probes/Makefile | 1 + arch/arm64/kernel/probes/decode-insn.c | 143 +++++++++ arch/arm64/kernel/probes/decode-insn.h | 34 +++ arch/arm64/kernel/probes/kprobes.c | 525 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 1 + arch/arm64/mm/fault.c | 26 ++ 14 files changed, 859 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/include/asm/kprobes.h create mode 100644 arch/arm64/include/asm/probes.h create mode 100644 arch/arm64/kernel/probes/Makefile create mode 100644 arch/arm64/kernel/probes/decode-insn.c create mode 100644 arch/arm64/kernel/probes/decode-insn.h create mode 100644 arch/arm64/kernel/probes/kprobes.c (limited to 'arch/arm64/mm') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fab133c18a09..1f7d644f7f36 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -88,6 +88,7 @@ config ARM64 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RCU_TABLE_FREE select HAVE_SYSCALL_TRACEPOINTS + select HAVE_KPROBES select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 2fcb9b7c876c..4b6b3f72a215 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -66,6 +66,11 @@ #define CACHE_FLUSH_IS_SAFE 1 +/* kprobes BRK opcodes with ESR encoding */ +#define BRK64_ESR_MASK 0xFFFF +#define BRK64_ESR_KPROBES 0x0004 +#define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5)) + /* AArch32 */ #define DBG_ESR_EVT_BKPT 0x4 #define DBG_ESR_EVT_VECC 0x5 diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index a44abbd61382..1dbaa901d7e5 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -253,6 +253,8 @@ __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) +__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) __AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000) __AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000) diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h new file mode 100644 index 000000000000..79c9511612b5 --- /dev/null +++ b/arch/arm64/include/asm/kprobes.h @@ -0,0 +1,60 @@ +/* + * arch/arm64/include/asm/kprobes.h + * + * Copyright (C) 2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _ARM_KPROBES_H +#define _ARM_KPROBES_H + +#include +#include +#include + +#define __ARCH_WANT_KPROBES_INSN_SLOT +#define MAX_INSN_SIZE 1 +#define MAX_STACK_SIZE 128 + +#define flush_insn_slot(p) do { } while (0) +#define kretprobe_blacklist_size 0 + +#include + +struct prev_kprobe { + struct kprobe *kp; + unsigned int status; +}; + +/* Single step context for kprobe */ +struct kprobe_step_ctx { + unsigned long ss_pending; + unsigned long match_addr; +}; + +/* per-cpu kprobe control block */ +struct kprobe_ctlblk { + unsigned int kprobe_status; + unsigned long saved_irqflag; + struct prev_kprobe prev_kprobe; + struct kprobe_step_ctx ss_ctx; + struct pt_regs jprobe_saved_regs; + char jprobes_stack[MAX_STACK_SIZE]; +}; + +void arch_remove_kprobe(struct kprobe *); +int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr); +int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data); +int kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr); +int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr); + +#endif /* _ARM_KPROBES_H */ diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h new file mode 100644 index 000000000000..1e8a21a96002 --- /dev/null +++ b/arch/arm64/include/asm/probes.h @@ -0,0 +1,34 @@ +/* + * arch/arm64/include/asm/probes.h + * + * Copyright (C) 2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef _ARM_PROBES_H +#define _ARM_PROBES_H + +struct kprobe; +struct arch_specific_insn; + +typedef u32 kprobe_opcode_t; +typedef unsigned long (kprobes_pstate_check_t)(unsigned long); +typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *); + +/* architecture specific copy of original instruction */ +struct arch_specific_insn { + kprobe_opcode_t *insn; + kprobes_pstate_check_t *pstate_cc; + kprobes_handler_t *handler; + /* restore address after step xol */ + unsigned long restore; +}; + +#endif diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 941e12d235be..baf938135986 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -147,9 +147,12 @@ struct pt_regs { #define fast_interrupts_enabled(regs) \ (!((regs)->pstate & PSR_F_BIT)) -#define user_stack_pointer(regs) \ +#define GET_USP(regs) \ (!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp) +#define SET_USP(ptregs, value) \ + (!compat_user_mode(regs) ? ((regs)->sp = value) : ((regs)->compat_sp = value)) + extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, @@ -206,8 +209,15 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) struct task_struct; int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); -#define instruction_pointer(regs) ((unsigned long)(regs)->pc) +#define GET_IP(regs) ((unsigned long)(regs)->pc) +#define SET_IP(regs, value) ((regs)->pc = ((u64) (value))) + +#define GET_FP(ptregs) ((unsigned long)(ptregs)->regs[29]) +#define SET_FP(ptregs, value) ((ptregs)->regs[29] = ((u64) (value))) + +#include +#undef profile_pc extern unsigned long profile_pc(struct pt_regs *regs); #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 4653aca9bb61..367673e38700 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -46,7 +46,7 @@ arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o -obj-y += $(arm64-obj-y) vdso/ +obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) head-y := head.o extra-y += $(head-y) vmlinux.lds diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 4fbf3c54275c..395de61108d8 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -266,6 +267,10 @@ static int single_step_handler(unsigned long addr, unsigned int esr, */ user_rewind_single_step(current); } else { +#ifdef CONFIG_KPROBES + if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return 0; +#endif if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED) return 0; @@ -322,8 +327,15 @@ static int brk_handler(unsigned long addr, unsigned int esr, { if (user_mode(regs)) { send_user_sigtrap(TRAP_BRKPT); - } else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) { - pr_warning("Unexpected kernel BRK exception at EL1\n"); + } +#ifdef CONFIG_KPROBES + else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { + if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED) + return -EFAULT; + } +#endif + else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) { + pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile new file mode 100644 index 000000000000..bc159bf56992 --- /dev/null +++ b/arch/arm64/kernel/probes/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c new file mode 100644 index 000000000000..95c0c5281e7b --- /dev/null +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -0,0 +1,143 @@ +/* + * arch/arm64/kernel/probes/decode-insn.c + * + * Copyright (C) 2013 Linaro Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +#include "decode-insn.h" + +static bool __kprobes aarch64_insn_is_steppable(u32 insn) +{ + /* + * Branch instructions will write a new value into the PC which is + * likely to be relative to the XOL address and therefore invalid. + * Deliberate generation of an exception during stepping is also not + * currently safe. Lastly, MSR instructions can do any number of nasty + * things we can't handle during single-stepping. + */ + if (aarch64_get_insn_class(insn) == AARCH64_INSN_CLS_BR_SYS) { + if (aarch64_insn_is_branch(insn) || + aarch64_insn_is_msr_imm(insn) || + aarch64_insn_is_msr_reg(insn) || + aarch64_insn_is_exception(insn) || + aarch64_insn_is_eret(insn)) + return false; + + /* + * The MRS instruction may not return a correct value when + * executing in the single-stepping environment. We do make one + * exception, for reading the DAIF bits. + */ + if (aarch64_insn_is_mrs(insn)) + return aarch64_insn_extract_system_reg(insn) + != AARCH64_INSN_SPCLREG_DAIF; + + /* + * The HINT instruction is is problematic when single-stepping, + * except for the NOP case. + */ + if (aarch64_insn_is_hint(insn)) + return aarch64_insn_is_nop(insn); + + return true; + } + + /* + * Instructions which load PC relative literals are not going to work + * when executed from an XOL slot. Instructions doing an exclusive + * load/store are not going to complete successfully when single-step + * exception handling happens in the middle of the sequence. + */ + if (aarch64_insn_uses_literal(insn) || + aarch64_insn_is_exclusive(insn)) + return false; + + return true; +} + +/* Return: + * INSN_REJECTED If instruction is one not allowed to kprobe, + * INSN_GOOD If instruction is supported and uses instruction slot, + */ +static enum kprobe_insn __kprobes +arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) +{ + /* + * Instructions reading or modifying the PC won't work from the XOL + * slot. + */ + if (aarch64_insn_is_steppable(insn)) + return INSN_GOOD; + else + return INSN_REJECTED; +} + +static bool __kprobes +is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) +{ + while (scan_start > scan_end) { + /* + * atomic region starts from exclusive load and ends with + * exclusive store. + */ + if (aarch64_insn_is_store_ex(le32_to_cpu(*scan_start))) + return false; + else if (aarch64_insn_is_load_ex(le32_to_cpu(*scan_start))) + return true; + scan_start--; + } + + return false; +} + +enum kprobe_insn __kprobes +arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) +{ + enum kprobe_insn decoded; + kprobe_opcode_t insn = le32_to_cpu(*addr); + kprobe_opcode_t *scan_start = addr - 1; + kprobe_opcode_t *scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + struct module *mod; +#endif + + if (addr >= (kprobe_opcode_t *)_text && + scan_end < (kprobe_opcode_t *)_text) + scan_end = (kprobe_opcode_t *)_text; +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + else { + preempt_disable(); + mod = __module_address((unsigned long)addr); + if (mod && within_module_init((unsigned long)addr, mod) && + !within_module_init((unsigned long)scan_end, mod)) + scan_end = (kprobe_opcode_t *)mod->init_layout.base; + else if (mod && within_module_core((unsigned long)addr, mod) && + !within_module_core((unsigned long)scan_end, mod)) + scan_end = (kprobe_opcode_t *)mod->core_layout.base; + preempt_enable(); + } +#endif + decoded = arm_probe_decode_insn(insn, asi); + + if (decoded == INSN_REJECTED || + is_probed_address_atomic(scan_start, scan_end)) + return INSN_REJECTED; + + return decoded; +} diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h new file mode 100644 index 000000000000..ad5ba9cc3d45 --- /dev/null +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -0,0 +1,34 @@ +/* + * arch/arm64/kernel/probes/decode-insn.h + * + * Copyright (C) 2013 Linaro Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _ARM_KERNEL_KPROBES_ARM64_H +#define _ARM_KERNEL_KPROBES_ARM64_H + +/* + * ARM strongly recommends a limit of 128 bytes between LoadExcl and + * StoreExcl instructions in a single thread of execution. So keep the + * max atomic context size as 32. + */ +#define MAX_ATOMIC_CONTEXT_SIZE (128 / sizeof(kprobe_opcode_t)) + +enum kprobe_insn { + INSN_REJECTED, + INSN_GOOD, +}; + +enum kprobe_insn __kprobes +arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); + +#endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c new file mode 100644 index 000000000000..44968019ce0d --- /dev/null +++ b/arch/arm64/kernel/probes/kprobes.c @@ -0,0 +1,525 @@ +/* + * arch/arm64/kernel/probes/kprobes.c + * + * Kprobes support for ARM64 + * + * Copyright (C) 2013 Linaro Limited. + * Author: Sandeepa Prabhu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "decode-insn.h" + +#define MIN_STACK_SIZE(addr) (on_irq_stack(addr, raw_smp_processor_id()) ? \ + min((unsigned long)IRQ_STACK_SIZE, \ + IRQ_STACK_PTR(raw_smp_processor_id()) - (addr)) : \ + min((unsigned long)MAX_STACK_SIZE, \ + (unsigned long)current_thread_info() + THREAD_START_SP - (addr))) + +void jprobe_return_break(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +static void __kprobes arch_prepare_ss_slot(struct kprobe *p) +{ + /* prepare insn slot */ + p->ainsn.insn[0] = cpu_to_le32(p->opcode); + + flush_icache_range((uintptr_t) (p->ainsn.insn), + (uintptr_t) (p->ainsn.insn) + + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + + /* + * Needs restoring of return address after stepping xol. + */ + p->ainsn.restore = (unsigned long) p->addr + + sizeof(kprobe_opcode_t); +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long probe_addr = (unsigned long)p->addr; + extern char __start_rodata[]; + extern char __end_rodata[]; + + if (probe_addr & 0x3) + return -EINVAL; + + /* copy instruction */ + p->opcode = le32_to_cpu(*p->addr); + + if (in_exception_text(probe_addr)) + return -EINVAL; + if (probe_addr >= (unsigned long) __start_rodata && + probe_addr <= (unsigned long) __end_rodata) + return -EINVAL; + + /* decode instruction */ + switch (arm_kprobe_decode_insn(p->addr, &p->ainsn)) { + case INSN_REJECTED: /* insn not supported */ + return -EINVAL; + + case INSN_GOOD: /* instruction uses slot */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + break; + }; + + /* prepare the instruction */ + arch_prepare_ss_slot(p); + + return 0; +} + +static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) +{ + void *addrs[1]; + u32 insns[1]; + + addrs[0] = (void *)addr; + insns[0] = (u32)opcode; + + return aarch64_insn_patch_text(addrs, insns, 1); +} + +/* arm kprobe: install breakpoint in text */ +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + patch_text(p->addr, BRK64_OPCODE_KPROBES); +} + +/* disarm kprobe: remove breakpoint from text */ +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + patch_text(p->addr, p->opcode); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; +} + +static void __kprobes set_current_kprobe(struct kprobe *p) +{ + __this_cpu_write(current_kprobe, p); +} + +/* + * The D-flag (Debug mask) is set (masked) upon debug exception entry. + * Kprobes needs to clear (unmask) D-flag -ONLY- in case of recursive + * probe i.e. when probe hit from kprobe handler context upon + * executing the pre/post handlers. In this case we return with + * D-flag clear so that single-stepping can be carried-out. + * + * Leave D-flag set in all other cases. + */ +static void __kprobes +spsr_set_debug_flag(struct pt_regs *regs, int mask) +{ + unsigned long spsr = regs->pstate; + + if (mask) + spsr |= PSR_D_BIT; + else + spsr &= ~PSR_D_BIT; + + regs->pstate = spsr; +} + +/* + * Interrupts need to be disabled before single-step mode is set, and not + * reenabled until after single-step mode ends. + * Without disabling interrupt on local CPU, there is a chance of + * interrupt occurrence in the period of exception return and start of + * out-of-line single-step, that result in wrongly single stepping + * into the interrupt handler. + */ +static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + kcb->saved_irqflag = regs->pstate; + regs->pstate |= PSR_I_BIT; +} + +static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, + struct pt_regs *regs) +{ + if (kcb->saved_irqflag & PSR_I_BIT) + regs->pstate |= PSR_I_BIT; + else + regs->pstate &= ~PSR_I_BIT; +} + +static void __kprobes +set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr) +{ + kcb->ss_ctx.ss_pending = true; + kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t); +} + +static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb) +{ + kcb->ss_ctx.ss_pending = false; + kcb->ss_ctx.match_addr = 0; +} + +static void __kprobes setup_singlestep(struct kprobe *p, + struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) +{ + unsigned long slot; + + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_REENTER; + } else { + kcb->kprobe_status = KPROBE_HIT_SS; + } + + BUG_ON(!p->ainsn.insn); + + /* prepare for single stepping */ + slot = (unsigned long)p->ainsn.insn; + + set_ss_context(kcb, slot); /* mark pending ss */ + + if (kcb->kprobe_status == KPROBE_REENTER) + spsr_set_debug_flag(regs, 0); + + /* IRQs and single stepping do not mix well. */ + kprobes_save_local_irqflag(kcb, regs); + kernel_enable_single_step(regs); + instruction_pointer_set(regs, slot); +} + +static int __kprobes reenter_kprobe(struct kprobe *p, + struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + kprobes_inc_nmissed_count(p); + setup_singlestep(p, regs, kcb, 1); + break; + case KPROBE_HIT_SS: + case KPROBE_REENTER: + pr_warn("Unrecoverable kprobe detected at %p.\n", p->addr); + dump_kprobe(p); + BUG(); + break; + default: + WARN_ON(1); + return 0; + } + + return 1; +} + +static void __kprobes +post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + + if (!cur) + return; + + /* return addr restore if non-branching insn */ + if (cur->ainsn.restore != 0) + instruction_pointer_set(regs, cur->ainsn.restore); + + /* restore back original saved kprobe variables and continue */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + return; + } + /* call post handler */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) { + /* post_handler can hit breakpoint and single step + * again, so we enable D-flag for recursive exception. + */ + cur->post_handler(cur, regs, 0); + } + + reset_current_kprobe(); +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + instruction_pointer_set(regs, (unsigned long) cur->addr); + if (!instruction_pointer(regs)) + BUG(); + + kernel_disable_single_step(); + if (kcb->kprobe_status == KPROBE_REENTER) + spsr_set_debug_flag(regs, 1); + + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, fsr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + } + return 0; +} + +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + +static void __kprobes kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p, *cur_kprobe; + struct kprobe_ctlblk *kcb; + unsigned long addr = instruction_pointer(regs); + + kcb = get_kprobe_ctlblk(); + cur_kprobe = kprobe_running(); + + p = get_kprobe((kprobe_opcode_t *) addr); + + if (p) { + if (cur_kprobe) { + if (reenter_kprobe(p, regs, kcb)) + return; + } else { + /* Probe hit */ + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it prepped + * for calling the break_handler below on re-entry, + * so get out doing nothing more here. + * + * pre_handler can hit a breakpoint and can step thru + * before return, keep PSTATE D-flag enabled until + * pre_handler return back. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) { + setup_singlestep(p, regs, kcb, 0); + return; + } + } + } else if ((le32_to_cpu(*(kprobe_opcode_t *) addr) == + BRK64_OPCODE_KPROBES) && cur_kprobe) { + /* We probably hit a jprobe. Call its break handler. */ + if (cur_kprobe->break_handler && + cur_kprobe->break_handler(cur_kprobe, regs)) { + setup_singlestep(cur_kprobe, regs, kcb, 0); + return; + } + } + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Return back to original instruction, and continue. + */ +} + +static int __kprobes +kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) +{ + if ((kcb->ss_ctx.ss_pending) + && (kcb->ss_ctx.match_addr == addr)) { + clear_ss_context(kcb); /* clear pending ss */ + return DBG_HOOK_HANDLED; + } + /* not ours, kprobes should ignore it */ + return DBG_HOOK_ERROR; +} + +int __kprobes +kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + int retval; + + /* return error if this is not our step */ + retval = kprobe_ss_hit(kcb, instruction_pointer(regs)); + + if (retval == DBG_HOOK_HANDLED) { + kprobes_restore_local_irqflag(kcb, regs); + kernel_disable_single_step(); + + if (kcb->kprobe_status == KPROBE_REENTER) + spsr_set_debug_flag(regs, 1); + + post_kprobe_handler(kcb, regs); + } + + return retval; +} + +int __kprobes +kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) +{ + kprobe_handler(regs); + return DBG_HOOK_HANDLED; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + long stack_ptr = kernel_stack_pointer(regs); + + kcb->jprobe_saved_regs = *regs; + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (void *)stack_ptr, + MIN_STACK_SIZE(stack_ptr)); + + instruction_pointer_set(regs, (unsigned long) jp->entry); + preempt_disable(); + pause_graph_tracing(); + return 1; +} + +void __kprobes jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + /* + * Jprobe handler return by entering break exception, + * encoded same as kprobe, but with following conditions + * -a magic number in x0 to identify from rest of other kprobes. + * -restore stack addr to original saved pt_regs + */ + asm volatile ("ldr x0, [%0]\n\t" + "mov sp, x0\n\t" + ".globl jprobe_return_break\n\t" + "jprobe_return_break:\n\t" + "brk %1\n\t" + : + : "r"(&kcb->jprobe_saved_regs.sp), + "I"(BRK64_ESR_KPROBES) + : "memory"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + long stack_addr = kcb->jprobe_saved_regs.sp; + long orig_sp = kernel_stack_pointer(regs); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if (instruction_pointer(regs) != (u64) jprobe_return_break) + return 0; + + if (orig_sp != stack_addr) { + struct pt_regs *saved_regs = + (struct pt_regs *)kcb->jprobe_saved_regs.sp; + pr_err("current sp %lx does not match saved sp %lx\n", + orig_sp, stack_addr); + pr_err("Saved registers for jprobe %p\n", jp); + show_regs(saved_regs); + pr_err("Current registers\n"); + show_regs(regs); + BUG(); + } + unpause_graph_tracing(); + *regs = kcb->jprobe_saved_regs; + memcpy((void *)stack_addr, kcb->jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + preempt_enable_no_resched(); + return 1; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 435e820e898d..075ce322e82b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -121,6 +121,7 @@ SECTIONS TEXT_TEXT SCHED_TEXT LOCK_TEXT + KPROBES_TEXT HYPERVISOR_TEXT IDMAP_TEXT HIBERNATE_TEXT diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 013e2cbe7924..2408e51d781f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -41,6 +41,28 @@ static const char *fault_name(unsigned int esr); +#ifdef CONFIG_KPROBES +static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (!user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, esr)) + ret = 1; + preempt_enable(); + } + + return ret; +} +#else +static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) +{ + return 0; +} +#endif + /* * Dump out the page tables associated with 'addr' in mm 'mm'. */ @@ -259,6 +281,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (notify_page_fault(regs, esr)) + return 0; + tsk = current; mm = tsk->mm; @@ -629,6 +654,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, return rv; } +NOKPROBE_SYMBOL(do_debug_exception); #ifdef CONFIG_ARM64_PAN void cpu_enable_pan(void *__unused) -- cgit v1.2.3 From 2ce39ad15182604beb6c8fa8bed5e46b59fd1082 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 19 Jul 2016 15:07:37 +0100 Subject: arm64: debug: unmask PSTATE.D earlier Clearing PSTATE.D is one of the requirements for generating a debug exception. The arm64 booting protocol requires that PSTATE.D is set, since many of the debug registers (for example, the hw_breakpoint registers) are UNKNOWN out of reset and could potentially generate spurious, fatal debug exceptions in early boot code if PSTATE.D was clear. Once the debug registers have been safely initialised, PSTATE.D is cleared, however this is currently broken for two reasons: (1) The boot CPU clears PSTATE.D in a postcore_initcall and secondary CPUs clear PSTATE.D in secondary_start_kernel. Since the initcall runs after SMP (and the scheduler) have been initialised, there is no guarantee that it is actually running on the boot CPU. In this case, the boot CPU is left with PSTATE.D set and is not capable of generating debug exceptions. (2) In a preemptible kernel, we may explicitly schedule on the IRQ return path to EL1. If an IRQ occurs with PSTATE.D set in the idle thread, then we may schedule the kthread_init thread, run the postcore_initcall to clear PSTATE.D and then context switch back to the idle thread before returning from the IRQ. The exception return path will then restore PSTATE.D from the stack, and set it again. This patch fixes the problem by moving the clearing of PSTATE.D earlier to proc.S. This has the desirable effect of clearing it in one place for all CPUs, long before we have to worry about the scheduler or any exception handling. We ensure that the previous reset of MDSCR_EL1 has completed before unmasking the exception, so that any spurious exceptions resulting from UNKNOWN debug registers are not generated. Without this patch applied, the kprobes selftests have been seen to fail under KVM, where we end up attempting to step the OOL instruction buffer with PSTATE.D set and therefore fail to complete the step. Cc: Acked-by: Mark Rutland Reported-by: Catalin Marinas Tested-by: Marc Zyngier Signed-off-by: Will Deacon Reviewed-by: Catalin Marinas Tested-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/kernel/debug-monitors.c | 1 - arch/arm64/kernel/smp.c | 1 - arch/arm64/mm/proc.S | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 4fbf3c54275c..0800d23e2fdd 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -151,7 +151,6 @@ static int debug_monitors_init(void) /* Clear the OS lock. */ on_each_cpu(clear_os_lock, NULL, 1); isb(); - local_dbg_enable(); /* Register hotplug handler. */ __register_cpu_notifier(&os_lock_nb); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 62ff3c0622e2..c0a772560ab7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -267,7 +267,6 @@ asmlinkage void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); - local_dbg_enable(); local_irq_enable(); local_async_enable(); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c4317879b938..5bb61de23201 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -180,6 +180,8 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 + isb // Unmask debug exceptions now, + enable_dbg // since this is per-cpu reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: -- cgit v1.2.3 From 258a1605c78878ada00417de4840c4c427673ffa Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 22 Jul 2016 19:32:24 +0200 Subject: arm64: mm: make create_mapping_late() non-allocating The only purpose served by create_mapping_late() is to remap the already mapped .text and .rodata kernel segments with read-only permissions. Since we no longer allow block mappings to be split or merged, create_mapping_late() should not pass an allocation function pointer into __create_pgd_mapping(). So pass NULL instead. Signed-off-by: Ard Biesheuvel Reviewed-by: Laura Abbott Tested-by: Sudeep Holla Acked-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a96a2413fa18..33f36cede02d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -312,7 +312,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - late_pgtable_alloc, !debug_pagealloc_enabled()); + NULL, !debug_pagealloc_enabled()); } static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) -- cgit v1.2.3 From 1378dc3d4ba07ccd295b04ef59e943162531ad25 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 22 Jul 2016 19:32:25 +0200 Subject: arm64: mm: run pgtable_page_ctor() on non-swapper translation table pages The kernel page table creation routines are accessible to other subsystems (e.g., EFI) via the create_pgd_mapping() entry point, which allows mappings to be created that are not covered by init_mm. Since generic code such as apply_to_page_range() may expect translation table pages that are not associated with init_mm to be covered by fully constructed struct pages, add a call to pgtable_page_ctor() in the alloc function used by create_pgd_mapping. Since it is no longer used by create_mapping_late(), also update the name of this function to better reflect its purpose. Signed-off-by: Ard Biesheuvel Reviewed-by: Laura Abbott Tested-by: Sudeep Holla Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 33f36cede02d..51a558195bb9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -268,10 +268,11 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, } while (pgd++, addr = next, addr != end); } -static phys_addr_t late_pgtable_alloc(void) +static phys_addr_t pgd_pgtable_alloc(void) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); - BUG_ON(!ptr); + if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) + BUG(); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); @@ -298,8 +299,10 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, bool allow_block_mappings) { + BUG_ON(mm == &init_mm); + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - late_pgtable_alloc, allow_block_mappings); + pgd_pgtable_alloc, allow_block_mappings); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, -- cgit v1.2.3