From b469d4329cf949043f9b93a6644f2c64015ef8cd Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 11 Jan 2012 05:51:10 +0000 Subject: kmemleak: Only scan non-zero-size areas Kmemleak should only track valid scan areas with a non-zero size. Otherwise, such area may reside just at the end of an object and kmemleak would report "Adding scan area to unknown object". Signed-off-by: Tiejun Chen Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c833addd94d7..f9f7310f0fdb 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1036,7 +1036,7 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); -- cgit v1.2.3 From b370d29ea7565a638ccf85389488364b5abb39fa Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 20 Jan 2012 10:42:40 +0000 Subject: kmemleak: Disable early logging when kmemleak is off by default Commit b6693005 (kmemleak: When the early log buffer is exceeded, report the actual number) deferred the disabling of the early logging to kmemleak_init(). However, when CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the early logging was no longer disabled causing __init kmemleak functions to be called even after the kernel freed the init memory. This patch disables the early logging during kmemleak_init() if kmemleak is left disabled. Reported-by: Dirk Gouders Tested-by: Dirk Gouders Tested-by: Josh Boyer Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f9f7310f0fdb..45eb6217bf38 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1757,6 +1757,7 @@ void __init kmemleak_init(void) #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { + atomic_set(&kmemleak_early_log, 0); kmemleak_disable(); return; } -- cgit v1.2.3 From 2673b4cf5d59c3ee5e0c12f6d734d38770324dc4 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 29 Jan 2012 12:17:33 -0600 Subject: backing-dev: fix wakeup timer races with bdi_unregister() While 7a401a972df8e18 ("backing-dev: ensure wakeup_timer is deleted") addressed the problem of the bdi being freed with a queued wakeup timer, there are other races that could happen if the wakeup timer expires after/during bdi_unregister(), before bdi_destroy() is called. wakeup_timer_fn() could attempt to wakeup a task which has already has been freed, or could access a NULL bdi->dev via the wake_forker_thread tracepoint. Cc: Cc: Jens Axboe Reported-by: Chanho Min Reviewed-by: Namjae Jeon Signed-off-by: Rabin Vincent Signed-off-by: Wu Fengguang --- mm/backing-dev.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7ba8feae11b8..dd8e2aafb07e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -318,7 +318,7 @@ static void wakeup_timer_fn(unsigned long data) if (bdi->wb.task) { trace_writeback_wake_thread(bdi); wake_up_process(bdi->wb.task); - } else { + } else if (bdi->dev) { /* * When bdi tasks are inactive for long time, they are killed. * In this case we have to wake-up the forker thread which @@ -584,6 +584,8 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { + struct task_struct *task; + if (!bdi_cap_writeback_dirty(bdi)) return; @@ -602,8 +604,13 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) * Finally, kill the kernel thread. We don't need to be RCU * safe anymore, since the bdi is gone from visibility. */ - if (bdi->wb.task) - kthread_stop(bdi->wb.task); + spin_lock_bh(&bdi->wb_lock); + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock_bh(&bdi->wb_lock); + + if (task) + kthread_stop(task); } /* @@ -623,7 +630,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { + struct device *dev = bdi->dev; + + if (dev) { bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); @@ -632,8 +641,12 @@ void bdi_unregister(struct backing_dev_info *bdi) if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); - device_unregister(bdi->dev); + + spin_lock_bh(&bdi->wb_lock); bdi->dev = NULL; + spin_unlock_bh(&bdi->wb_lock); + + device_unregister(dev); } } EXPORT_SYMBOL(bdi_unregister); -- cgit v1.2.3 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 -------------------- include/linux/sched.h | 6 ++++++ kernel/fork.c | 20 ++++++++++++++++++++ mm/process_vm_access.c | 23 +++++++++-------------- 4 files changed, 35 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index d9512bd03e6c..d4548dd49b02 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { return mm_access(task, PTRACE_MODE_READ); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2234985a5e65..7d379a6bfd88 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2259,6 +2259,12 @@ static inline void mmdrop(struct mm_struct * mm) extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); +/* + * Grab a reference to a task's mm, if it is not already going away + * and ptrace_may_access with the mode parameter passed to it + * succeeds. + */ +extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c1..1b2ef3c23ae4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -647,6 +647,26 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, mode)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e920aa3ce104..c20ff48994c2 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -298,23 +298,18 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto free_proc_pages; } - task_lock(task); - if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { - task_unlock(task); - rc = -EPERM; - goto put_task_struct; - } - mm = task->mm; - - if (!mm || (task->flags & PF_KTHREAD)) { - task_unlock(task); - rc = -EINVAL; + mm = mm_access(task, PTRACE_MODE_ATTACH); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more a + * appropriate error code for process_vw_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; goto put_task_struct; } - atomic_inc(&mm->mm_users); - task_unlock(task); - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, -- cgit v1.2.3 From 35512ecaef03250fe50ad81430dd467f01d9a96b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 3 Feb 2012 15:37:13 -0800 Subject: mm: postpone migrated page mapping reset Postpone resetting page->mapping until the final remove_migration_ptes(). Otherwise the expression PageAnon(migration_entry_to_page(entry)) does not work. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 9871a56d82c3..df141f60289e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -445,7 +445,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); - page->mapping = NULL; /* * If any waiters have accumulated on the new page then @@ -667,6 +666,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } else { if (remap_swapcache) remove_migration_ptes(page, newpage); + page->mapping = NULL; } unlock_page(newpage); -- cgit v1.2.3 From 82b3f2a7171731cce62f25058d25afb91a14710c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 3 Feb 2012 15:37:14 -0800 Subject: mm/memcontrol.c: fix warning with CONFIG_NUMA=n mm/memcontrol.c: In function 'memcg_check_events': mm/memcontrol.c:779: warning: unused variable 'do_numainfo' Acked-by: Michal Hocko Cc: Li Zefan Cc: Hiroyuki KAMEZAWA Cc: Johannes Weiner Acked-by: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 556859fec4ef..6728a7ae6f2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -776,7 +776,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit, do_numainfo; + bool do_softlimit; + bool do_numainfo __maybe_unused; do_softlimit = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); -- cgit v1.2.3 From 99f02ef1f18631eb0a4e0ea0a3d56878dbcb4b90 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Fri, 3 Feb 2012 15:37:14 -0800 Subject: mm/filemap_xip.c: fix race condition in xip_file_fault() Fix a race condition that shows in conjunction with xip_file_fault() when two threads of the same user process fault on the same memory page. In this case, the race winner will install the page table entry and the unlucky loser will cause an oops: xip_file_fault calls vm_insert_pfn (via vm_insert_mixed) which drops out at this check: retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; The resulting -EBUSY return value will trigger a BUG_ON() in xip_file_fault. This fix simply considers the fault as fixed in this case, because the race winner has successfully installed the pte. [akpm@linux-foundation.org: use conventional (and consistent) comment layout] Reported-by: David Sadler Signed-off-by: Carsten Otte Reported-by: Louis Alex Eisner Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index f91b2f687343..a4eb31132229 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -263,7 +263,12 @@ found: xip_pfn); if (err == -ENOMEM) return VM_FAULT_OOM; - BUG_ON(err); + /* + * err == -EBUSY is fine, we've raced against another thread + * that faulted-in the same page + */ + if (err != -EBUSY) + BUG_ON(err); return VM_FAULT_NOPAGE; } else { int err, ret = VM_FAULT_OOM; -- cgit v1.2.3 From 3deaa7190a8da38453c4fabd9dec7f66d17fff67 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 3 Feb 2012 15:37:17 -0800 Subject: readahead: fix pipeline break caused by block plug Herbert Poetzl reported a performance regression since 2.6.39. The test is a simple dd read, but with big block size. The reason is: T1: ra (A, A+128k), (A+128k, A+256k) T2: lock_page for page A, submit the 256k T3: hit page A+128K, ra (A+256k, A+384). the range isn't submitted because of plug and there isn't any lock_page till we hit page A+256k because all pages from A to A+256k is in memory T4: hit page A+256k, ra (A+384, A+ 512). Because of plug, the range isn't submitted again. T5: lock_page A+256k, so (A+256k, A+512k) will be submitted. The task is waitting for (A+256k, A+512k) finish. There is no request to disk in T3 and T4, so readahead pipeline breaks. We really don't need block plug for generic_file_aio_read() for buffered I/O. The readahead already has plug and has fine grained control when I/O should be submitted. Deleting plug for buffered I/O fixes the regression. One side effect is plug makes the request size 256k, the size is 128k without it. This is because default ra size is 128k and not a reason we need plug here. Vivek said: : We submit some readahead IO to device request queue but because of nested : plug, queue never gets unplugged. When read logic reaches a page which is : not in page cache, it waits for page to be read from the disk : (lock_page_killable()) and that time we flush the plug list. : : So effectively read ahead logic is kind of broken in parts because of : nested plugging. Removing top level plug (generic_file_aio_read()) for : buffered reads, will allow unplugging queue earlier for readahead. Signed-off-by: Shaohua Li Signed-off-by: Wu Fengguang Reported-by: Herbert Poetzl Tested-by: Eric Dumazet Cc: Christoph Hellwig Cc: Jens Axboe Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 97f49ed35bd2..b66275757c28 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1400,15 +1400,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; - struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; - blk_start_plug(&plug); - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1424,8 +1421,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { + struct blk_plug plug; + + blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); + blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -1481,7 +1482,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: - blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); -- cgit v1.2.3 From 0bf380bc70ecba68cb4d74dc656cc2fa8c4d801a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 3 Feb 2012 15:37:18 -0800 Subject: mm: compaction: check pfn_valid when entering a new MAX_ORDER_NR_PAGES block during isolation for migration When isolating for migration, migration starts at the start of a zone which is not necessarily pageblock aligned. Further, it stops isolating when COMPACT_CLUSTER_MAX pages are isolated so migrate_pfn is generally not aligned. This allows isolate_migratepages() to call pfn_to_page() on an invalid PFN which can result in a crash. This was originally reported against a 3.0-based kernel with the following trace in a crash dump. PID: 9902 TASK: d47aecd0 CPU: 0 COMMAND: "memcg_process_s" #0 [d72d3ad0] crash_kexec at c028cfdb #1 [d72d3b24] oops_end at c05c5322 #2 [d72d3b38] __bad_area_nosemaphore at c0227e60 #3 [d72d3bec] bad_area at c0227fb6 #4 [d72d3c00] do_page_fault at c05c72ec #5 [d72d3c80] error_code (via page_fault) at c05c47a4 EAX: 00000000 EBX: 000c0000 ECX: 00000001 EDX: 00000807 EBP: 000c0000 DS: 007b ESI: 00000001 ES: 007b EDI: f3000a80 GS: 6f50 CS: 0060 EIP: c030b15a ERR: ffffffff EFLAGS: 00010002 #6 [d72d3cb4] isolate_migratepages at c030b15a #7 [d72d3d14] zone_watermark_ok at c02d26cb #8 [d72d3d2c] compact_zone at c030b8de #9 [d72d3d68] compact_zone_order at c030bba1 #10 [d72d3db4] try_to_compact_pages at c030bc84 #11 [d72d3ddc] __alloc_pages_direct_compact at c02d61e7 #12 [d72d3e08] __alloc_pages_slowpath at c02d66c7 #13 [d72d3e78] __alloc_pages_nodemask at c02d6a97 #14 [d72d3eb8] alloc_pages_vma at c030a845 #15 [d72d3ed4] do_huge_pmd_anonymous_page at c03178eb #16 [d72d3f00] handle_mm_fault at c02f36c6 #17 [d72d3f30] do_page_fault at c05c70ed #18 [d72d3fb0] error_code (via page_fault) at c05c47a4 EAX: b71ff000 EBX: 00000001 ECX: 00001600 EDX: 00000431 DS: 007b ESI: 08048950 ES: 007b EDI: bfaa3788 SS: 007b ESP: bfaa36e0 EBP: bfaa3828 GS: 6f50 CS: 0073 EIP: 080487c8 ERR: ffffffff EFLAGS: 00010202 It was also reported by Herbert van den Bergh against 3.1-based kernel with the following snippet from the console log. BUG: unable to handle kernel paging request at 01c00008 IP: [] isolate_migratepages+0x119/0x390 *pdpt = 000000002f7ce001 *pde = 0000000000000000 It is expected that it also affects 3.2.x and current mainline. The problem is that pfn_valid is only called on the first PFN being checked and that PFN is not necessarily aligned. Lets say we have a case like this H = MAX_ORDER_NR_PAGES boundary | = pageblock boundary m = cc->migrate_pfn f = cc->free_pfn o = memory hole H------|------H------|----m-Hoooooo|ooooooH-f----|------H The migrate_pfn is just below a memory hole and the free scanner is beyond the hole. When isolate_migratepages started, it scans from migrate_pfn to migrate_pfn+pageblock_nr_pages which is now in a memory hole. It checks pfn_valid() on the first PFN but then scans into the hole where there are not necessarily valid struct pages. This patch ensures that isolate_migratepages calls pfn_valid when necessary. Reported-by: Herbert van den Bergh Tested-by: Herbert van den Bergh Signed-off-by: Mel Gorman Acked-by: Michal Nazarewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 71a58f67f481..bd939a574b84 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -313,6 +313,19 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } else if (!locked) spin_lock_irq(&zone->lru_lock); + /* + * migrate_pfn does not necessarily start aligned to a + * pageblock. Ensure that pfn_valid is called when moving + * into a new MAX_ORDER_NR_PAGES range in case of large + * memory holes within the zone + */ + if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(low_pfn)) { + low_pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + } + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; -- cgit v1.2.3 From dc9086004b3d5db75997a645b3fe08d9138b7ad0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 8 Feb 2012 17:13:38 -0800 Subject: mm: compaction: check for overlapping nodes during isolation for migration When isolating pages for migration, migration starts at the start of a zone while the free scanner starts at the end of the zone. Migration avoids entering a new zone by never going beyond the free scanned. Unfortunately, in very rare cases nodes can overlap. When this happens, migration isolates pages without the LRU lock held, corrupting lists which will trigger errors in reclaim or during page free such as in the following oops BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] free_pcppages_bulk+0xcc/0x450 PGD 1dda554067 PUD 1e1cb58067 PMD 0 Oops: 0000 [#1] SMP CPU 37 Pid: 17088, comm: memcg_process_s Tainted: G X RIP: free_pcppages_bulk+0xcc/0x450 Process memcg_process_s (pid: 17088, threadinfo ffff881c2926e000, task ffff881c2926c0c0) Call Trace: free_hot_cold_page+0x17e/0x1f0 __pagevec_free+0x90/0xb0 release_pages+0x22a/0x260 pagevec_lru_move_fn+0xf3/0x110 putback_lru_page+0x66/0xe0 unmap_and_move+0x156/0x180 migrate_pages+0x9e/0x1b0 compact_zone+0x1f3/0x2f0 compact_zone_order+0xa2/0xe0 try_to_compact_pages+0xdf/0x110 __alloc_pages_direct_compact+0xee/0x1c0 __alloc_pages_slowpath+0x370/0x830 __alloc_pages_nodemask+0x1b1/0x1c0 alloc_pages_vma+0x9b/0x160 do_huge_pmd_anonymous_page+0x160/0x270 do_page_fault+0x207/0x4c0 page_fault+0x25/0x30 The "X" in the taint flag means that external modules were loaded but but is unrelated to the bug triggering. The real problem was because the PFN layout looks like this Zone PFN ranges: DMA 0x00000010 -> 0x00001000 DMA32 0x00001000 -> 0x00100000 Normal 0x00100000 -> 0x01e80000 Movable zone start PFN for each node early_node_map[14] active PFN ranges 0: 0x00000010 -> 0x0000009b 0: 0x00000100 -> 0x0007a1ec 0: 0x0007a354 -> 0x0007a379 0: 0x0007f7ff -> 0x0007f800 0: 0x00100000 -> 0x00680000 1: 0x00680000 -> 0x00e80000 0: 0x00e80000 -> 0x01080000 1: 0x01080000 -> 0x01280000 0: 0x01280000 -> 0x01480000 1: 0x01480000 -> 0x01680000 0: 0x01680000 -> 0x01880000 1: 0x01880000 -> 0x01a80000 0: 0x01a80000 -> 0x01c80000 1: 0x01c80000 -> 0x01e80000 The fix is straight-forward. isolate_migratepages() has to make a similar check to isolate_freepage to ensure that it never isolates pages from a zone it does not hold the LRU lock for. This was discovered in a 3.0-based kernel but it affects 3.1.x, 3.2.x and current mainline. Signed-off-by: Mel Gorman Acked-by: Michal Nazarewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index bd939a574b84..d9ebebe1a2aa 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -330,8 +330,17 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; nr_scanned++; - /* Get the page and skip if free */ + /* + * Get the page and ensure the page is within the same zone. + * See the comment in isolate_freepages about overlapping + * nodes. It is deliberate that the new zone lock is not taken + * as memory compaction should not move pages between nodes. + */ page = pfn_to_page(low_pfn); + if (page_zone(page) != zone) + continue; + + /* Skip if free */ if (PageBuddy(page)) continue; -- cgit v1.2.3 From b9980cdcf2524c5fe15d8cbae9c97b3ed6385563 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 8 Feb 2012 17:13:40 -0800 Subject: mm: fix UP THP spin_is_locked BUGs Fix CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_SMP=n CONFIG_DEBUG_VM=y CONFIG_DEBUG_SPINLOCK=n kernel: spin_is_locked() is then always false, and so triggers some BUGs in Transparent HugePage codepaths. asm-generic/bug.h mentions this problem, and provides a WARN_ON_SMP(x); but being too lazy to add VM_BUG_ON_SMP, BUG_ON_SMP, WARN_ON_SMP_ONCE, VM_WARN_ON_SMP_ONCE, just test NR_CPUS != 1 in the existing VM_BUG_ONs. Signed-off-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- mm/swap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b3ffc21ce801..91d3efb25d15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2083,7 +2083,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -2113,7 +2113,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; diff --git a/mm/swap.c b/mm/swap.c index b0f529b38979..fff1ff7fb9ad 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -659,7 +659,7 @@ void lru_add_page_tail(struct zone* zone, VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); VM_BUG_ON(PageLRU(page_tail)); - VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); SetPageLRU(page_tail); -- cgit v1.2.3