From 910154d520c97cd0095a889e6b878041c91111a6 Mon Sep 17 00:00:00 2001 From: Geoffrey Thomas Date: Wed, 9 Mar 2016 14:08:04 -0800 Subject: mm/hugetlb: hugetlb_no_page: rate-limit warning message The warning message "killed due to inadequate hugepage pool" simply indicates that SIGBUS was sent, not that the process was forcibly killed. If the process has a signal handler installed does not fix the problem, this message can rapidly spam the kernel log. On my amd64 dev machine that does not have hugepages configured, I can reproduce the repeated warnings easily by setting vm.nr_hugepages=2 (i.e., 4 megabytes of huge pages) and running something that sets a signal handler and forks, like #include #include #include #include sig_atomic_t counter = 10; void handler(int signal) { if (counter-- == 0) exit(0); } int main(void) { int status; char *addr = mmap(NULL, 4 * 1048576, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) {perror("mmap"); return 1;} *addr = 'x'; switch (fork()) { case -1: perror("fork"); return 1; case 0: signal(SIGBUS, handler); *addr = 'x'; break; default: *addr = 'x'; wait(&status); if (WIFSIGNALED(status)) { psignal(WTERMSIG(status), "child"); } break; } } Signed-off-by: Geoffrey Thomas Cc: Naoya Horiguchi Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01f2b48c8618..0e27a9db6eb9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3502,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { - pr_warning("PID %d killed due to inadequate hugepage pool\n", + pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); return ret; } -- cgit v1.2.3 From 06b241f32c711d7ca868a0351dd97fe91fd8817b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 9 Mar 2016 14:08:07 -0800 Subject: mm: __delete_from_page_cache show Bad page if mapped Commit e1534ae95004 ("mm: differentiate page_mapped() from page_mapcount() for compound pages") changed the famous BUG_ON(page_mapped(page)) in __delete_from_page_cache() to VM_BUG_ON_PAGE(page_mapped(page)): which gives us more info when CONFIG_DEBUG_VM=y, but nothing at all when not. Although it has not usually been very helpul, being hit long after the error in question, we do need to know if it actually happens on users' systems; but reinstating a crash there is likely to be opposed :) In the non-debug case, pr_alert("BUG: Bad page cache") plus dump_page(), dump_stack(), add_taint() - I don't really believe LOCKDEP_NOW_UNRELIABLE, but that seems to be the standard procedure now. Move that, or the VM_BUG_ON_PAGE(), up before the deletion from tree: so that the unNULLified page->mapping gives a little more information. If the inode is being evicted (rather than truncated), it won't have any vmas left, so it's safe(ish) to assume that the raised mapcount is erroneous, and we can discount it from page_count to avoid leaking the page (I'm less worried by leaking the occasional 4kB, than losing a potential 2MB page with each 4kB page leaked). Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Joonsoo Kim Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3461d97ecb30..da7a35d83de7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -195,6 +195,30 @@ void __delete_from_page_cache(struct page *page, void *shadow, else cleancache_invalidate_page(mapping, page); + VM_BUG_ON_PAGE(page_mapped(page), page); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + int mapcount; + + pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + dump_page(page, "still mapped when deleted"); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + mapcount = page_mapcount(page); + if (mapping_exiting(mapping) && + page_count(page) >= mapcount + 2) { + /* + * All vmas have already been torn down, so it's + * a good bet that actually the page is unmapped, + * and we'd prefer not to leak it: if we're wrong, + * some other bad page check should catch it later. + */ + page_mapcount_reset(page); + atomic_sub(mapcount, &page->_count); + } + } + page_cache_tree_delete(mapping, page, shadow); page->mapping = NULL; @@ -205,7 +229,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); - VM_BUG_ON_PAGE(page_mapped(page), page); /* * At this point page must be either written or cleaned by truncate. -- cgit v1.2.3 From d77a117e6871ff78a06def46583d23752593de60 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:10 -0800 Subject: list: kill list_force_poison() Given we have uninitialized list_heads being passed to list_add() it will always be the case that those uninitialized values randomly trigger the poison value. Especially since a list_add() operation will seed the stack with the poison value for later stack allocations to trip over. For example, see these two false positive reports: list_add attempted on force-poisoned entry WARNING: at lib/list_debug.c:34 [..] NIP [c00000000043c390] __list_add+0xb0/0x150 LR [c00000000043c38c] __list_add+0xac/0x150 Call Trace: __list_add+0xac/0x150 (unreliable) __down+0x4c/0xf8 down+0x68/0x70 xfs_buf_lock+0x4c/0x150 [xfs] list_add attempted on force-poisoned entry(0000000000000500), new->next == d0000000059ecdb0, new->prev == 0000000000000500 WARNING: at lib/list_debug.c:33 [..] NIP [c00000000042db78] __list_add+0xa8/0x140 LR [c00000000042db74] __list_add+0xa4/0x140 Call Trace: __list_add+0xa4/0x140 (unreliable) rwsem_down_read_failed+0x6c/0x1a0 down_read+0x58/0x60 xfs_log_commit_cil+0x7c/0x600 [xfs] Fixes: commit 5c2c2587b132 ("mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup") Signed-off-by: Dan Williams Reported-by: Eryu Guan Tested-by: Eryu Guan Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 11 ----------- kernel/memremap.c | 9 +++++++-- lib/list_debug.c | 9 --------- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 30cf4200ab40..5356f4d661a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif -#ifdef CONFIG_DEBUG_LIST -/* - * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one - * of the pages it allocates is ever passed to list_add() - */ -extern void list_force_poison(struct list_head *entry); -#else -/* fallback to the less strict LIST_POISON* definitions */ -#define list_force_poison list_del -#endif - /** * list_replace - replace old entry by new one * @old : the element to be replaced diff --git a/kernel/memremap.c b/kernel/memremap.c index b981a7b023f0..778191e3e887 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -351,8 +351,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, for_each_device_pfn(pfn, page_map) { struct page *page = pfn_to_page(pfn); - /* ZONE_DEVICE pages must never appear on a slab lru */ - list_force_poison(&page->lru); + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); page->pgmap = pgmap; } devres_add(dev, page_map); diff --git a/lib/list_debug.c b/lib/list_debug.c index 3345a089ef7b..3859bf63561c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -12,13 +12,6 @@ #include #include -static struct list_head force_poison; -void list_force_poison(struct list_head *entry) -{ - entry->next = &force_poison; - entry->prev = &force_poison; -} - /* * Insert a new entry between two known consecutive entries. * @@ -30,8 +23,6 @@ void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { - WARN(new->next == &force_poison || new->prev == &force_poison, - "list_add attempted on force-poisoned entry\n"); WARN(next->prev != prev, "list_add corruption. next->prev should be " "prev (%p), but was %p. (next=%p).\n", -- cgit v1.2.3 From 5f29a77cd95771edebbf41e5800fdd557c69302a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:13 -0800 Subject: mm: fix mixed zone detection in devm_memremap_pages The check for whether we overlap "System RAM" needs to be done at section granularity. For example a system with the following mapping: 100000000-37bffffff : System RAM 37c000000-837ffffff : Persistent Memory ...is unable to use devm_memremap_pages() as it would result in two zones colliding within a given section. Signed-off-by: Dan Williams Cc: Ross Zwisler Reviewed-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 778191e3e887..60baf4d3401e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -270,13 +270,16 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; + int error, nid, is_ram; unsigned long pfn; - int error, nid; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, "System RAM"); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -314,8 +317,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); align_end = align_start + align_size - 1; for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; -- cgit v1.2.3 From e3ae116339f9a0c77523abc95e338fa405946e07 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:15 -0800 Subject: kasan: add functions to clear stack poison Functions which the compiler has instrumented for ASAN place poison on the stack shadow upon entry and remove this poison prior to returning. In some cases (e.g. hotplug and idle), CPUs may exit the kernel a number of levels deep in C code. If there are any instrumented functions on this critical path, these will leave portions of the idle thread stack shadow poisoned. If a CPU returns to the kernel via a different path (e.g. a cold entry), then depending on stack frame layout subsequent calls to instrumented functions may use regions of the stack with stale poison, resulting in (spurious) KASAN splats to the console. Contemporary GCCs always add stack shadow poisoning when ASAN is enabled, even when asked to not instrument a function [1], so we can't simply annotate functions on the critical path to avoid poisoning. Instead, this series explicitly removes any stale poison before it can be hit. In the common hotplug case we clear the entire stack shadow in common code, before a CPU is brought online. On architectures which perform a cold return as part of cpu idle may retain an architecture-specific amount of stack contents. To retain the poison for this retained context, the arch code must call the core KASAN code, passing a "watermark" stack pointer value beyond which shadow will be cleared. Architectures which don't perform a cold return as part of idle do not need any additional code. This patch (of 3): Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poision prior to returning. In some cases (e.g. hotplug and idle), CPUs may exit the kernel a number of levels deep in C code. If there are any instrumented functions on this critical path, these will leave portions of the stack shadow poisoned. If a CPU returns to the kernel via a different path (e.g. a cold entry), then depending on stack frame layout subsequent calls to instrumented functions may use regions of the stack with stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, we must clear stale poison from the stack prior to instrumented functions being called. This patch adds functions to the KASAN core for removing poison from (portions of) a task's stack. These will be used by subsequent patches to avoid problems with hotplug and idle. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Lorenzo Pieralisi Cc: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++++- mm/kasan/kasan.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4b9f85c963d0..0fdc798e3ff7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -1,6 +1,7 @@ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H +#include #include struct kmem_cache; @@ -13,7 +14,6 @@ struct vm_struct; #include #include -#include extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; @@ -43,6 +43,8 @@ static inline void kasan_disable_current(void) void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_task_stack(struct task_struct *task); + void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -66,6 +68,8 @@ void kasan_free_shadow(const struct vm_struct *vm); static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_task_stack(struct task_struct *task) {} + static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index bc0a8d8b8f42..1ad20ade8c91 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +static void __kasan_unpoison_stack(struct task_struct *task, void *sp) +{ + void *base = task_stack_page(task); + size_t size = sp - base; + + kasan_unpoison_shadow(base, size); +} + +/* Unpoison the entire stack for a task. */ +void kasan_unpoison_task_stack(struct task_struct *task) +{ + __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE); +} + +/* Unpoison the stack for the current task beyond a watermark sp value. */ +asmlinkage void kasan_unpoison_remaining_stack(void *sp) +{ + __kasan_unpoison_stack(current, sp); +} /* * All functions below always inlined so compiler could -- cgit v1.2.3 From e1b77c92981a522223bd1ac118fdcade6b7ad086 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:18 -0800 Subject: sched/kasan: remove stale KASAN poison after hotplug Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poision prior to returning. In the case of CPU hotplug, CPUs exit the kernel a number of levels deep in C code. Any instrumented functions on this critical path will leave portions of the stack shadow poisoned. When a CPU is subsequently brought back into the kernel via a different path, depending on stackframe, layout calls to instrumented functions may hit this stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, clear any stale poison from the idle thread for a CPU prior to bringing a CPU online. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Reviewed-by: Ingo Molnar Cc: Alexander Potapenko Cc: Lorenzo Pieralisi Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..41f6b2215aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include #include #include #include @@ -5096,6 +5097,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, -- cgit v1.2.3 From 0d97e6d8024c71cc838b292c01d5bd951e080eba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:21 -0800 Subject: arm64: kasan: clear stale stack poison Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poison prior to returning. In the case of cpuidle, CPUs exit the kernel a number of levels deep in C code. Any instrumented functions on this critical path will leave portions of the stack shadow poisoned. If CPUs lose context and return to the kernel via a cold path, we restore a prior context saved in __cpu_suspend_enter are forgotten, and we never remove the poison they placed in the stack shadow area by functions calls between this and the actual exit of the kernel. Thus, (depending on stackframe layout) subsequent calls to instrumented functions may hit this stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, clear any stale poison from the idle thread for a CPU prior to bringing a CPU online. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Reviewed-by: Lorenzo Pieralisi Cc: Alexander Potapenko Cc: Catalin Marinas Cc: Will Deacon Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/sleep.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index e33fe33876ab..fd10eb663868 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -145,6 +145,10 @@ ENTRY(cpu_resume_mmu) ENDPROC(cpu_resume_mmu) .popsection cpu_resume_after_mmu: +#ifdef CONFIG_KASAN + mov x0, sp + bl kasan_unpoison_remaining_stack +#endif mov x0, #0 // return zero on success ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] -- cgit v1.2.3 From 566e8dfd88d9e0d12873ca69c26e82c0af8479d8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 14:08:24 -0800 Subject: ocfs2: fix return value from ocfs2_page_mkwrite() ocfs2_page_mkwrite() could mistakenly return error code instead of mkwrite status value. Fix it. Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9581d190f6e1..77ebc2bc1cca 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } -- cgit v1.2.3 From 30f471fd88e0304bee2c17ef1a4651e705870817 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 9 Mar 2016 14:08:27 -0800 Subject: dax: check return value of dax_radix_entry() dax_pfn_mkwrite() previously wasn't checking the return value of the call to dax_radix_entry(), which was a mistake. Instead, capture this return value and return the appropriate VM_FAULT_ value. Signed-off-by: Ross Zwisler Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 711172450da6..bbb2ad783770 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1056,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; + int error; /* * We pass NO_SECTOR to dax_radix_entry() because we expect that a @@ -1065,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * saves us from having to make a call to get_block() here to look * up the sector. */ - dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); + error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, + true); + + if (error == -ENOMEM) + return VM_FAULT_OOM; + if (error) + return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); -- cgit v1.2.3 From 0a2e280b6d8ea4afef07c749070705d6af403b7f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 9 Mar 2016 14:08:30 -0800 Subject: mm, thp: fix migration of PTE-mapped transparent huge pages We don't have native support of THP migration, so we have to split huge page into small pages in order to migrate it to different node. This includes PTE-mapped huge pages. I made mistake in refcounting patchset: we don't actually split PTE-mapped huge page in queue_pages_pte_range(), if we step on head page. The result is that the head page is queued for migration, but none of tail pages: putting head page on queue takes pin on the page and any subsequent attempts of split_huge_pages() would fail and we skip queuing tail pages. unmap_and_move_huge_page() will eventually split the huge pages, but only one of 512 pages would get migrated. Let's fix the situation. Fixes: 248db92da13f2507 ("migrate_pages: try to split pages on queuing") Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c4187c0e1de..9a3f6b90e628 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -532,7 +532,7 @@ retry: nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; - if (PageTail(page) && PageAnon(page)) { + if (PageTransCompound(page) && PageAnon(page)) { get_page(page); pte_unmap_unlock(pte, ptl); lock_page(page); -- cgit v1.2.3 From ac343e882a8377caef5fa75d9093cb77e9d4bf6d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 9 Mar 2016 14:08:32 -0800 Subject: memremap: check pfn validity before passing to pfn_to_page() In memremap's helper function try_ram_remap(), we dereference a struct page pointer that was derived from a PFN that is known to be covered by a 'System RAM' iomem region, and is thus assumed to be a 'valid' PFN, i.e., a PFN that has a struct page associated with it and is covered by the kernel direct mapping. However, the assumption that there is a 1:1 relation between the System RAM iomem region and the kernel direct mapping is not universally valid on all architectures, and on ARM and arm64, 'System RAM' may include regions for which pfn_valid() returns false. Generally speaking, both __va() and pfn_to_page() should only ever be called on PFNs/physical addresses for which pfn_valid() returns true, so add that check to try_ram_remap(). Signed-off-by: Ard Biesheuvel Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 60baf4d3401e..6cf54615a9c4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } -- cgit v1.2.3 From 86613628b3d367743f71b945c203774c522404f4 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Wed, 9 Mar 2016 14:08:35 -0800 Subject: mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers Replace ENOTSUPP with EOPNOTSUPP. If hugepages are not supported, this value is propagated to userspace. EOPNOTSUPP is part of uapi and is widely supported by libc libraries. It gives nicer message to user, rather than: # cat /proc/sys/vm/nr_hugepages cat: /proc/sys/vm/nr_hugepages: Unknown error 524 And also LTP's proc01 test was failing because this ret code (524) was unexpected: proc01 1 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_hugepages: errno=???(524): Unknown error 524 proc01 2 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_hugepages_mempolicy: errno=???(524): Unknown error 524 proc01 3 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_overcommit_hugepages: errno=???(524): Unknown error 524 Signed-off-by: Jan Stancek Acked-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Acked-by: David Rientjes Acked-by: Hillf Danton Cc: Mike Kravetz Cc: Dave Hansen Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0e27a9db6eb9..aefba5a9cc47 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2751,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; table->data = &tmp; table->maxlen = sizeof(unsigned long); @@ -2792,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; -- cgit v1.2.3 From d6b7eaeb03421139e32800324ef04ab50bba886d Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 9 Mar 2016 14:08:38 -0800 Subject: dma-mapping: avoid oops when parameter cpu_addr is null To keep consistent with kfree, which tolerate ptr is NULL. We do this because sometimes we may use goto statement, so that success and failure case can share parts of the code. But unfortunately, dma_free_coherent called with parameter cpu_addr is null will cause oops, such as showed below: Unable to handle kernel paging request at virtual address ffffffc020d3b2b8 pgd = ffffffc083a61000 [ffffffc020d3b2b8] *pgd=0000000000000000, *pud=0000000000000000 CPU: 4 PID: 1489 Comm: malloc_dma_1 Tainted: G O 4.1.12 #1 Hardware name: ARM64 (DT) PC is at __dma_free_coherent.isra.10+0x74/0xc8 LR is at __dma_free+0x9c/0xb0 Process malloc_dma_1 (pid: 1489, stack limit = 0xffffffc0837fc020) [...] Call trace: __dma_free_coherent.isra.10+0x74/0xc8 __dma_free+0x9c/0xb0 malloc_dma+0x104/0x158 [dma_alloc_coherent_mtmalloc] kthread+0xec/0xfc Signed-off-by: Zhen Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 75857cda38e9..728ef074602a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -386,7 +386,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size, if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) return; - if (!ops->free) + if (!ops->free || !cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); -- cgit v1.2.3