From d88c0922fa0e2c021a028b310a641126c6d4b7dc Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 2 Nov 2010 13:05:18 -0700 Subject: Release page reference during page fault retry This slipped by when unifying the filemap and swap versions of lock_page_or_retry()... Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 75572b5f2374..61ba5e405791 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1563,8 +1563,10 @@ retry_find: goto no_cached_page; } - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); return ret | VM_FAULT_RETRY; + } /* Did it get truncated? */ if (unlikely(page->mapping != mapping)) { -- cgit v1.2.3 From ff8b16d7e15a8ba2a6086645614a483e048e3fbf Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 4 Nov 2010 01:56:49 +0800 Subject: vmstat: fix offset calculation on void* Fix regression introduced by commit 79da826aee6 ("writeback: report dirty thresholds in /proc/vmstat"). The incorrect pointer arithmetic can result in problems like this: BUG: unable to handle kernel paging request at 07c06d16 IP: [] strnlen+0x6/0x20 Call Trace: [] ? string+0x39/0xe0 [] ? __wake_up_common+0x4b/0x80 [] ? vsnprintf+0x1ec/0x380 [] ? seq_printf+0x2e/0x60 [] ? vmstat_show+0x26/0x30 [] ? seq_read+0xa6/0x380 [] ? seq_read+0x0/0x380 [] ? proc_reg_read+0x5f/0x90 [] ? vfs_read+0xa1/0x140 [] ? proc_reg_read+0x0/0x90 [] ? sys_read+0x41/0x70 [] ? 
sysenter_do_call+0x12/0x26 Reported-by: Tetsuo Handa Cc: Michael Rubin Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index cd2e42be7b68..42eac4d33216 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -949,7 +949,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v[PGPGIN] /= 2; /* sectors -> kbytes */ v[PGPGOUT] /= 2; #endif - return m->private + *pos; + return (unsigned long *)m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -- cgit v1.2.3 From 63bfd7384b119409685a17d5c58f0b56e5dc03da Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 8 Nov 2010 21:29:07 +0200 Subject: perf_events: Fix perf_counter_mmap() hook in mprotect() As pointed out by Linus, commit dab5855 ("perf_counter: Add mmap event hooks to mprotect()") is fundamentally wrong as mprotect_fixup() can free 'vma' due to merging. Fix the problem by moving the perf_event_mmap() hook to mprotect_fixup(). Note: there's another successful return path from mprotect_fixup() if the old flags are equal to the new flags. We don't, however, need to call perf_event_mmap() there because 'perf' already knows the VMA is executable. 
Reported-by: Dave Jones Analyzed-by: Linus Torvalds Cc: Ingo Molnar Reviewed-by: Peter Zijlstra Signed-off-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 2d1bf7cf8851..4c5133873097 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -211,6 +211,7 @@ success: mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); + perf_event_mmap(vma); return 0; fail: @@ -299,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) -- cgit v1.2.3 From d2e61b8dc99fdb36e0fd176e25365f69afda4ff9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Nov 2010 14:05:12 -0800 Subject: memcg: null dereference on allocation failure The original code had a null dereference if alloc_percpu() failed. 
This was introduced in commit 711d3d2c9bc3 ("memcg: cpu hotplug aware percpu count updates") Signed-off-by: Dan Carpenter Reviewed-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a99cfaf0a19..2efa8ea07ff7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4208,15 +4208,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!mem->stat) { - if (size < PAGE_SIZE) - kfree(mem); - else - vfree(mem); - mem = NULL; - } + if (!mem->stat) + goto out_free; spin_lock_init(&mem->pcp_counter_lock); return mem; + +out_free: + if (size < PAGE_SIZE) + kfree(mem); + else + vfree(mem); + return NULL; } /* -- cgit v1.2.3 From 8d056cb965b8fb7c53c564abf28b1962d1061cd3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 11 Nov 2010 14:05:15 -0800 Subject: mm/vfs: revalidate page->mapping in do_generic_file_read() 70 hours into some stress tests of a 2.6.32-based enterprise kernel, we ran into a NULL dereference in here: int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from) { ----> struct inode *inode = page->mapping->host; It looks like page->mapping was the culprit. (xmon trace is below). After closer examination, I realized that do_generic_file_read() does a find_get_page(), and eventually locks the page before calling block_is_partially_uptodate(). However, it doesn't revalidate the page->mapping after the page is locked. So, there's a small window between the find_get_page() and ->is_partially_uptodate() where the page could get truncated and page->mapping cleared. We _have_ a reference, so it can't get reclaimed, but it certainly can be truncated. 
I think the correct thing is to check page->mapping after the trylock_page(), and jump out if it got truncated. This patch has been running in the test environment for a month or so now, and we have not seen this bug pop up again. xmon info: 1f:mon> e cpu 0x1f: Vector: 300 (Data Access) at [c0000002ae36f770] pc: c0000000001e7a6c: .block_is_partially_uptodate+0xc/0x100 lr: c000000000142944: .generic_file_aio_read+0x1e4/0x770 sp: c0000002ae36f9f0 msr: 8000000000009032 dar: 0 dsisr: 40000000 current = 0xc000000378f99e30 paca = 0xc000000000f66300 pid = 21946, comm = bash 1f:mon> r R00 = 0025c0500000006d R16 = 0000000000000000 R01 = c0000002ae36f9f0 R17 = c000000362cd3af0 R02 = c000000000e8cd80 R18 = ffffffffffffffff R03 = c0000000031d0f88 R19 = 0000000000000001 R04 = c0000002ae36fa68 R20 = c0000003bb97b8a0 R05 = 0000000000000000 R21 = c0000002ae36fa68 R06 = 0000000000000000 R22 = 0000000000000000 R07 = 0000000000000001 R23 = c0000002ae36fbb0 R08 = 0000000000000002 R24 = 0000000000000000 R09 = 0000000000000000 R25 = c000000362cd3a80 R10 = 0000000000000000 R26 = 0000000000000002 R11 = c0000000001e7b60 R27 = 0000000000000000 R12 = 0000000042000484 R28 = 0000000000000001 R13 = c000000000f66300 R29 = c0000003bb97b9b8 R14 = 0000000000000001 R30 = c000000000e28a08 R15 = 000000000000ffff R31 = c0000000031d0f88 pc = c0000000001e7a6c .block_is_partially_uptodate+0xc/0x100 lr = c000000000142944 .generic_file_aio_read+0x1e4/0x770 msr = 8000000000009032 cr = 22000488 ctr = c0000000001e7a60 xer = 0000000020000000 trap = 300 dar = 0000000000000000 dsisr = 40000000 1f:mon> t [link register ] c000000000142944 .generic_file_aio_read+0x1e4/0x770 [c0000002ae36f9f0] c000000000142a14 .generic_file_aio_read+0x2b4/0x770 (unreliable) [c0000002ae36fb40] c0000000001b03e4 .do_sync_read+0xd4/0x160 [c0000002ae36fce0] c0000000001b153c .vfs_read+0xec/0x1f0 [c0000002ae36fd80] c0000000001b1768 .SyS_read+0x58/0xb0 [c0000002ae36fe30] c00000000000852c syscall_exit+0x0/0x40 --- Exception: c00 (System Call) 
at 00000080a840bc54 SP (fffca15df30) is in userspace 1f:mon> di c0000000001e7a6c c0000000001e7a6c e9290000 ld r9,0(r9) c0000000001e7a70 418200c0 beq c0000000001e7b30 # .block_is_partially_uptodate+0xd0/0x100 c0000000001e7a74 e9440008 ld r10,8(r4) c0000000001e7a78 78a80020 clrldi r8,r5,32 c0000000001e7a7c 3c000001 lis r0,1 c0000000001e7a80 812900a8 lwz r9,168(r9) c0000000001e7a84 39600001 li r11,1 c0000000001e7a88 7c080050 subf r0,r8,r0 c0000000001e7a8c 7f805040 cmplw cr7,r0,r10 c0000000001e7a90 7d6b4830 slw r11,r11,r9 c0000000001e7a94 796b0020 clrldi r11,r11,32 c0000000001e7a98 419d00a8 bgt cr7,c0000000001e7b40 # .block_is_partially_uptodate+0xe0/0x100 c0000000001e7a9c 7fa55840 cmpld cr7,r5,r11 c0000000001e7aa0 7d004214 add r8,r0,r8 c0000000001e7aa4 79080020 clrldi r8,r8,32 c0000000001e7aa8 419c0078 blt cr7,c0000000001e7b20 # .block_is_partially_uptodate+0xc0/0x100 Signed-off-by: Dave Hansen Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Acked-by: Rik van Riel Cc: Cc: Cc: Christoph Hellwig Cc: Al Viro Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 61ba5e405791..4ee2e998e937 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1029,6 +1029,9 @@ find_page: goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, desc, offset)) goto page_not_up_to_date_locked; -- cgit v1.2.3 From 1dce071e18b7264457d17c0dec4c7e430bfaee7d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Nov 2010 14:05:17 -0800 Subject: vmscan: avoid setting zone congested if no page dirty nr_dirty and nr_congested are increased only when the page is dirty. So if all pages are clean, both them will be zero. In this case, we should not mark the zone congested. 
Signed-off-by: Shaohua Li Reviewed-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b8a6fdc21312..d31d7ce52c0e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -913,7 +913,7 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty == nr_congested) + if (nr_dirty == nr_congested && nr_dirty != 0) zone_set_flag(zone, ZONE_CONGESTED); free_page_list(&free_pages); -- cgit v1.2.3 From 27d20fddc8af539464fc3ba499d6a830054c3bd6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 11 Nov 2010 14:05:19 -0800 Subject: radix-tree: fix RCU bug Salman Qazi describes the following radix-tree bug: In the following case, we can get a deadlock: 0. The radix tree contains two items, one has the index 0. 1. The reader (in this case find_get_pages) takes the rcu_read_lock. 2. The reader acquires slot(s) for item(s) including the index 0 item. 3. The non-zero index item is deleted, and as a consequence the other item is moved to the root of the tree. The place where it used to be is queued for deletion after the readers finish. 3b. The zero item is deleted, removing it from the direct slot, it remains in the rcu-delayed indirect node. 4. The reader looks at the index 0 slot, and finds that the page has 0 ref count 5. The reader looks at it again, hoping that the item will either be freed or the ref count will increase. This never happens, as the slot it is looking at will never be updated. Also, this slot can never be reclaimed because the reader is holding rcu_read_lock and is in an infinite loop. The fix is to re-use the same "indirect" pointer case that requires a slot lookup retry into a general "retry the lookup" bit. 
Signed-off-by: Nick Piggin Reported-by: Salman Qazi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 39 +++++++++++++--------- lib/radix-tree.c | 83 ++++++++++++++++++++++++++++++++-------------- mm/filemap.c | 26 ++++++--------- 3 files changed, 91 insertions(+), 57 deletions(-) (limited to 'mm') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index a39cbed9ee17..ab2baa5c4884 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -34,19 +34,13 @@ * needed for RCU lookups (because root->height is unreliable). The only * time callers need worry about this is when doing a lookup_slot under * RCU. + * + * Indirect pointer in fact is also used to tag the last pointer of a node + * when it is shrunk, before we rcu free the node. See shrink code for + * details. */ #define RADIX_TREE_INDIRECT_PTR 1 -#define RADIX_TREE_RETRY ((void *)-1UL) - -static inline void *radix_tree_ptr_to_indirect(void *ptr) -{ - return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); -} -static inline void *radix_tree_indirect_to_ptr(void *ptr) -{ - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); -} #define radix_tree_indirect_to_ptr(ptr) \ radix_tree_indirect_to_ptr((void __force *)(ptr)) @@ -140,16 +134,29 @@ do { \ * removed. * * For use with radix_tree_lookup_slot(). Caller must hold tree at least read - * locked across slot lookup and dereference. More likely, will be used with - * radix_tree_replace_slot(), as well, so caller will hold tree write locked. + * locked across slot lookup and dereference. Not required if write lock is + * held (ie. items cannot be concurrently inserted). + * + * radix_tree_deref_retry must be used to confirm validity of the pointer if + * only the read lock is held. 
*/ static inline void *radix_tree_deref_slot(void **pslot) { - void *ret = rcu_dereference(*pslot); - if (unlikely(radix_tree_is_indirect_ptr(ret))) - ret = RADIX_TREE_RETRY; - return ret; + return rcu_dereference(*pslot); } + +/** + * radix_tree_deref_retry - check radix_tree_deref_slot + * @arg: pointer returned by radix_tree_deref_slot + * Returns: 0 if retry is not required, otherwise retry is required + * + * radix_tree_deref_retry must be used with radix_tree_deref_slot. + */ +static inline int radix_tree_deref_retry(void *arg) +{ + return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR); +} + /** * radix_tree_replace_slot - replace item in a slot * @pslot: pointer to slot, returned by radix_tree_lookup_slot diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 6f412ab4c24f..5086bb962b4d 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -82,6 +82,16 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline void *ptr_to_indirect(void *ptr) +{ + return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); +} + +static inline void *indirect_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); +} + static inline gfp_t root_gfp_mask(struct radix_tree_root *root) { return root->gfp_mask & __GFP_BITS_MASK; @@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) return -ENOMEM; /* Increase the height. 
*/ - node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); + node->slots[0] = indirect_to_ptr(root->rnode); /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { @@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) newheight = root->height+1; node->height = newheight; node->count = 1; - node = radix_tree_ptr_to_indirect(node); + node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; } while (height > root->height); @@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root, return error; } - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - rcu_assign_pointer(root->rnode, - radix_tree_ptr_to_indirect(slot)); + rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); } /* Go a level down */ @@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, return NULL; return is_slot ? (void *)&root->rnode : node; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, height--; } while (height > 0); - return is_slot ? (void *)slot:node; + return is_slot ? 
(void *)slot : indirect_to_ptr(node); } /** @@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, height = root->height; BUG_ON(index > radix_tree_maxindex(height)); - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { @@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); while (height > 0) { int offset; @@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(node)) return (index == 0); - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, } shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); /* * we fill the path from (root->height - 2) to 0, leaving the index at @@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[0] = node; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, slot = *(((void ***)results)[ret + i]); if (!slot) continue; - results[ret + nr_found] = rcu_dereference_raw(slot); + results[ret + nr_found] = + indirect_to_ptr(rcu_dereference_raw(slot)); nr_found++; } ret += nr_found; @@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, results[0] = (void **)&root->rnode; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); 
@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[0] = node; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, slot = *(((void ***)results)[ret + i]); if (!slot) continue; - results[ret + nr_found] = rcu_dereference_raw(slot); + results[ret + nr_found] = + indirect_to_ptr(rcu_dereference_raw(slot)); nr_found++; } ret += nr_found; @@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, results[0] = (void **)&root->rnode; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) void *newptr; BUG_ON(!radix_tree_is_indirect_ptr(to_free)); - to_free = radix_tree_indirect_to_ptr(to_free); + to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child @@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /* * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another. If - * it was safe to dereference the old pointer to it + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it * (to_free->slots[0]), it will be safe to dereference the new - * one (root->rnode). + * one (root->rnode) as far as dependent read barriers go. */ newptr = to_free->slots[0]; if (root->height > 1) - newptr = radix_tree_ptr_to_indirect(newptr); + newptr = ptr_to_indirect(newptr); root->rnode = newptr; root->height--; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. 
However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page is 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + if (root->height == 0) + *((unsigned long *)&to_free->slots[0]) |= + RADIX_TREE_INDIRECT_PTR; + radix_tree_node_free(to_free); } } @@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) root->rnode = NULL; goto out; } - slot = radix_tree_indirect_to_ptr(slot); + slot = indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; @@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) radix_tree_node_free(to_free); if (pathp->node->count) { - if (pathp->node == - radix_tree_indirect_to_ptr(root->rnode)) + if (pathp->node == indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } diff --git a/mm/filemap.c b/mm/filemap.c index 4ee2e998e937..ea89840fc65f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -644,7 +644,9 @@ repeat: pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); if (pagep) { page = radix_tree_deref_slot(pagep); - if (unlikely(!page || page == RADIX_TREE_RETRY)) + if (unlikely(!page)) + goto out; + if (radix_tree_deref_retry(page)) goto repeat; if (!page_cache_get_speculative(page)) @@ -660,6 +662,7 @@ repeat: goto repeat; } } +out: rcu_read_unlock(); return page; @@ -777,12 +780,11 @@ 
repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) { + if (ret) + start = pages[ret-1]->index; goto restart; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -830,11 +832,7 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) goto restart; if (page->mapping == NULL || page->index != index) @@ -887,11 +885,7 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) goto restart; if (!page_cache_get_speculative(page)) -- cgit v1.2.3 From 68cee4f118c21a1c67e5764a91d766661db5b360 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 28 Oct 2010 13:50:37 +0400 Subject: slub: Fix slub_lock down/up imbalance There are two places, that do not release the slub_lock. Respective bugs were introduced by sysfs changes ab4d5ed5 (slub: Enable sysfs support for !CONFIG_SLUB_DEBUG) and 2bce6485 ( slub: Allow removal of slab caches during boot). 
Acked-by: Christoph Lameter Signed-off-by: Pavel Emelyanov Signed-off-by: Pekka Enberg --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 8fd5401bb071..981fb730aa04 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3273,9 +3273,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, kfree(n); kfree(s); } +err: up_write(&slub_lock); -err: if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -3862,6 +3862,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif + up_read(&slub_lock); kfree(nodes); return x + sprintf(buf + x, "\n"); } -- cgit v1.2.3 From 04c3496152394d17e3bc2316f9731ee3e8a026bc Mon Sep 17 00:00:00 2001 From: "Steven J. Magnani" Date: Wed, 24 Nov 2010 12:56:54 -0800 Subject: nommu: yield CPU while disposing VM Depending on processor speed, page size, and the amount of memory a process is allowed to amass, cleanup of a large VM may freeze the system for many seconds. This can result in a watchdog timeout. Make sure other tasks receive some service when cleaning up large VMs. Signed-off-by: Steven J. Magnani Cc: Greg Ungerer Reviewed-by: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 3613517c7592..27a9ac588516 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1717,6 +1717,7 @@ void exit_mmap(struct mm_struct *mm) mm->mmap = vma->vm_next; delete_vma_from_mm(vma); delete_vma(mm, vma); + cond_resched(); } kleave(""); -- cgit v1.2.3 From 112bc2e120a94a511858918d6866a4978f9c500e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Nov 2010 12:56:58 -0800 Subject: memcg: fix false positive VM_BUG on non-SMP Fix this: kernel BUG at mm/memcontrol.c:2155! 
invalid opcode: 0000 [#1] last sysfs file: Pid: 18, comm: sh Not tainted 2.6.37-rc3 #3 /Bochs EIP: 0060:[] EFLAGS: 00000246 CPU: 0 EIP is at mem_cgroup_move_account+0xe2/0xf0 EAX: 00000004 EBX: c6f931d4 ECX: c681c300 EDX: c681c000 ESI: c681c300 EDI: ffffffea EBP: c681c000 ESP: c46f3e30 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 Process sh (pid: 18, ti=c46f2000 task=c6826e60 task.ti=c46f2000) Stack: 00000155 c681c000 0805f000 c46ee180 c46f3e5c c7058820 c1074d37 00000000 08060000 c46db9a0 c46ec080 c7058820 0805f000 08060000 c46f3e98 c1074c50 c106c75e c46f3e98 c46ec080 08060000 0805ffff c46db9a0 c46f3e98 c46e0340 Call Trace: [] ? mem_cgroup_move_charge_pte_range+0xe7/0x130 [] ? mem_cgroup_move_charge_pte_range+0x0/0x130 [] ? walk_page_range+0xee/0x1d0 [] ? mem_cgroup_move_task+0x66/0x90 [] ? mem_cgroup_move_charge_pte_range+0x0/0x130 [] ? mem_cgroup_move_task+0x0/0x90 [] ? cgroup_attach_task+0x136/0x200 [] ? cgroup_tasks_write+0x48/0xc0 [] ? cgroup_file_write+0xde/0x220 [] ? do_page_fault+0x17d/0x3f0 [] ? alloc_fd+0x2d/0xd0 [] ? cgroup_file_write+0x0/0x220 [] ? vfs_write+0x92/0xc0 [] ? sys_write+0x41/0x70 [] ? syscall_call+0x7/0xb Code: 03 00 74 09 8b 44 24 04 e8 1c f1 ff ff 89 73 04 8d 86 b0 00 00 00 b9 01 00 00 00 89 da 31 ff e8 65 f5 ff ff e9 4d ff ff ff 0f 0b <0f> 0b 0f 0b 0f 0b 90 8d b4 26 00 00 00 00 83 ec 10 8b 0d f4 e3 EIP: [] mem_cgroup_move_account+0xe2/0xf0 SS:ESP 0068:c46f3e30 ---[ end trace 7daa1582159b6532 ]--- lock_page_cgroup and unlock_page_cgroup are implemented using bit_spinlock. bit_spinlock doesn't touch the bit if we are on non-SMP machine, so we can't use the bit to check whether the lock was taken. Let's introduce is_page_cgroup_locked based on bit_spin_is_locked instead of PageCgroupLocked to fix it. [akpm@linux-foundation.org: s/is_page_cgroup_locked/page_is_cgroup_locked/] Signed-off-by: Kirill A. 
Shutemov Reviewed-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 7 +++++-- mm/memcontrol.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 5bb13b3db84d..b02195dfc1b0 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -59,8 +59,6 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } -TESTPCGFLAG(Locked, LOCK) - /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -104,6 +102,11 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } +static inline int page_is_cgroup_locked(struct page_cgroup *pc) +{ + return bit_spin_is_locked(PCG_LOCK, &pc->flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efa8ea07ff7..62d1880f6992 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2152,7 +2152,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, { VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!PageCgroupLocked(pc)); + VM_BUG_ON(!page_is_cgroup_locked(pc)); VM_BUG_ON(!PageCgroupUsed(pc)); VM_BUG_ON(pc->mem_cgroup != from); -- cgit v1.2.3 From b1dd693e5b9348bd68a80e679e03cf9c0973b01b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 24 Nov 2010 12:57:06 -0800 Subject: memcg: avoid deadlock between move charge and try_charge() __mem_cgroup_try_charge() can be called under down_write(&mmap_sem)(e.g. mlock does it). 
This means it can cause deadlock if it races with move charge: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot acquire the lock | -> true | schedule() Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot acquire the lock | if (mc.moving_task) | -> true | schedule() To avoid this deadlock, we do all the move charge work (both can_attach() and attach()) under one mmap_sem section. And after this patch, we set/clear mc.moving_task outside mc.lock, because we use the lock only to check mc.from/to. 
Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 62d1880f6992..26218df8d19d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -278,13 +278,14 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ + struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } - up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void) mc.moved_swap = 0; } + if (mc.mm) { + up_read(&mc.mm->mmap_sem); + mmput(mc.mm); + } spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); + mc.moving_task = NULL; + mc.mm = NULL; mem_cgroup_end_move(from); memcg_oom_recover(from); memcg_oom_recover(to); @@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 
return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { + /* + * We do all the move charge works under one mmap_sem to + * avoid deadlock with down_write(&mmap_sem) + * -> try_charge() -> if (mc.moving_task) -> sleep. + */ + down_read(&mm->mmap_sem); + VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + VM_BUG_ON(mc.mm); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; @@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mc.precharge = 0; mc.moved_charge = 0; mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + mc.moving_task = current; + mc.mm = mm; ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - } - mmput(mm); + /* We call up_read() and mmput() in clear_mc(). */ + } else + mmput(mm); } return ret; } @@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } - up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - struct mm_struct *mm; - - if (!mc.to) + if (!mc.mm) /* no need to move charge */ return; - mm = get_task_mm(p); - if (mm) { - mem_cgroup_move_charge(mm); - mmput(mm); - } + mem_cgroup_move_charge(mc.mm); mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- cgit v1.2.3 From a42c390cfa0c2612459d7226ba11612847ca3a64 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Nov 2010 12:57:08 -0800 Subject: cgroups: make swap accounting default behavior configurable 
Swap accounting can be configured by CONFIG_CGROUP_MEM_RES_CTLR_SWAP configuration option and then it is turned on by default. There is a boot option (noswapaccount) which can disable this feature. This makes it hard for distributors to enable the configuration option as this feature leads to a bigger memory consumption and this is a no-go for general purpose distribution kernel. On the other hand swap accounting may be very useful for some workloads. This patch adds a new configuration option which controls the default behavior (CGROUP_MEM_RES_CTLR_SWAP_ENABLED). If the option is selected then the feature is turned on by default. It also adds a new boot parameter swapaccount[=1|0] which enhances the original noswapaccount parameter semantic by means of enable/disable logic (defaults to 1 if no value is provided to be still consistent with noswapaccount). The default behavior is unchanged (if CONFIG_CGROUP_MEM_RES_CTLR_SWAP is enabled then CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is enabled as well) Signed-off-by: Michal Hocko Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ init/Kconfig | 13 +++++++++++++ mm/memcontrol.c | 21 +++++++++++++++++++-- 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 92e83e53148f..cdd2a6e8a3b7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2385,6 +2385,11 @@ and is between 256 and 4096 characters. It is defined in the file improve throughput, but will also increase the amount of memory reserved for use by the client. 
+ swapaccount[=0|1] + [KNL] Enable accounting of swap in memory resource + controller if no parameter or 1 is given or disable + it if 0 is given (See Documentation/cgroups/memory.txt) + swiotlb= [IA-64] Number of I/O TLB slabs switches= [HW,M68k] diff --git a/init/Kconfig b/init/Kconfig index 88c10468db46..c9728992a776 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -613,6 +613,19 @@ config CGROUP_MEM_RES_CTLR_SWAP if boot option "noswapaccount" is set, swap will not be accounted. Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +config CGROUP_MEM_RES_CTLR_SWAP_ENABLED + bool "Memory Resource Controller Swap Extension enabled by default" + depends on CGROUP_MEM_RES_CTLR_SWAP + default y + help + Memory Resource Controller Swap Extension comes with its price in + a bigger memory consumption. General purpose distribution kernels + which want to enable the feautre but keep it disabled by default + and let the user enable it by swapaccount boot command line + parameter should have this option unselected. + For those who want to have the feature enabled by default should + select this option (if, for some reason, they need to disable it + then noswapaccount does the trick). 
menuconfig CGROUP_SCHED bool "Group CPU scheduler" diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26218df8d19d..7a22b4129211 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; -static int really_do_swap_account __initdata = 1; /* for remember boot option*/ + +/* for remember boot option*/ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata = 0; +#endif + #else #define do_swap_account (0) #endif @@ -4920,10 +4927,20 @@ struct cgroup_subsys mem_cgroup_subsys = { }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static int __init enable_swap_account(char *s) +{ + /* consider enabled if no parameter or 1 is given */ + if (!s || !strcmp(s, "1")) + really_do_swap_account = 1; + else if (!strcmp(s, "0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - really_do_swap_account = 0; + enable_swap_account("0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.3 From e9959f0f37160e1f5351af828cc981712b5066c1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 24 Nov 2010 12:57:09 -0800 Subject: mm/page_alloc.c: fix build_all_zonelist() where percpu_alloc() is wrongly called under stop_machine_run() During memory hotplug, build_allzonelists() may be called under stop_machine_run(). In this function, setup_zone_pageset() is called. But it's bug because it will do page allocation under stop_machine_run(). Here is a report from Alok Kataria. 
BUG: sleeping function called from invalid context at kernel/mutex.c:94 in_atomic(): 0, irqs_disabled(): 1, pid: 4, name: migration/0 Pid: 4, comm: migration/0 Not tainted 2.6.35.6-45.fc14.x86_64 #1 Call Trace: [] __might_sleep+0xeb/0xf0 [] mutex_lock+0x24/0x50 [] pcpu_alloc+0x6d/0x7ee [] ? load_balance+0xbe/0x60e [] ? rt_se_boosted+0x21/0x2f [] ? dequeue_rt_stack+0x18b/0x1ed [] __alloc_percpu+0x10/0x12 [] setup_zone_pageset+0x38/0xbe [] ? build_zonelists_node.clone.58+0x79/0x8c [] __build_all_zonelists+0x419/0x46c [] ? cpu_stopper_thread+0xb2/0x198 [] stop_machine_cpu_stop+0x8e/0xc5 [] ? stop_machine_cpu_stop+0x0/0xc5 [] cpu_stopper_thread+0x108/0x198 [] ? schedule+0x5b2/0x5cc [] ? cpu_stopper_thread+0x0/0x198 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x87 [] ? kernel_thread_helper+0x0/0x10 Built 5 zonelists in Node order, mobility grouping on. Total pages: 289456 Policy zone: Normal This patch tries to fix the issue by moving setup_zone_pageset() out from stop_machine_run(). It's obviously not necessary to be called under stop_machine_run(). [akpm@linux-foundation.org: remove unneeded local] Reported-by: Alok Kataria Signed-off-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Petr Vandrovec Cc: Pekka Enberg Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..e4092704c1a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3008,14 +3008,6 @@ static __init_refok int __build_all_zonelists(void *data) build_zonelist_cache(pgdat); } -#ifdef CONFIG_MEMORY_HOTPLUG - /* Setup real pagesets for the new zone */ - if (data) { - struct zone *zone = data; - setup_zone_pageset(zone); - } -#endif - /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. 
The real pagesets for @@ -3064,7 +3056,11 @@ void build_all_zonelists(void *data) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, data, NULL); +#ifdef CONFIG_MEMORY_HOTPLUG + if (data) + setup_zone_pageset((struct zone *)data); +#endif + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.3 From 5f0af70a25593a9d53b87bc8d31902fb7cc63e40 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Nov 2010 12:57:10 -0800 Subject: mm: remove call to find_vma in pagewalk for non-hugetlbfs Commit d33b9f45 ("mm: hugetlb: fix hugepage memory leak in walk_page_range()") introduces a check if a vma is a hugetlbfs one and later in 5dc37642 ("mm hugetlb: add hugepage support to pagemap") it is moved under #ifdef CONFIG_HUGETLB_PAGE but a needless find_vma call is left behind and its result is not used anywhere else in the function. The side-effect of caching vma for @addr inside walk->mm is neither utilized in walk_page_range() nor in called functions. 
Signed-off-by: David Sterba Reviewed-by: Naoya Horiguchi Acked-by: Andi Kleen Cc: Andy Whitcroft Cc: David Rientjes Cc: Hugh Dickins Cc: Lee Schermerhorn Cc: Matt Mackall Acked-by: Mel Gorman Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8b1a2ce21ee5..38cc58b8b2b0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -139,7 +139,6 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; - struct vm_area_struct *vma; if (addr >= end) return err; @@ -149,15 +148,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { + struct vm_area_struct *uninitialized_var(vma); + next = pgd_addr_end(addr, end); +#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); -#ifdef CONFIG_HUGETLB_PAGE if (vma && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; -- cgit v1.2.3 From 1f64d69c7ad2e48e697493e45590679f7a69b7b2 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 2 Dec 2010 14:31:12 -0800 Subject: mm/hugetlb.c: avoid double unlock_page() in hugetlb_fault() Have hugetlb_fault() call unlock_page(page) only if it had previously called lock_page(page). Setting CONFIG_DEBUG_VM=y and then running the libhugetlbfs test suite, resulted in the tripping of VM_BUG_ON(!PageLocked(page)) in unlock_page() having been called by hugetlb_fault() when page == pagecache_page. This patch remedied the problem. 
Signed-off-by: Dean Nelson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c4a3558589ab..85855240933d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2738,7 +2738,8 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } - unlock_page(page); + if (page != pagecache_page) + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); -- cgit v1.2.3 From 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Mon Sep 17 00:00:00 2001 From: Zeng Zhaoming Date: Thu, 2 Dec 2010 14:31:13 -0800 Subject: mm/mempolicy.c: add rcu read lock to protect pid structure find_task_by_vpid() should be protected by rcu_read_lock(), to prevent free_pid() reclaiming pid. Signed-off-by: Zeng Zhaoming Cc: "Paul E. McKenney" Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4a57f135b76e..11ff260fb282 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1307,15 +1307,18 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out; /* Find the mm_struct */ + rcu_read_lock(); read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -EINVAL; if (!mm) -- cgit v1.2.3 From e172662d113ceb22db727a979bb35b9c02f703b5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 2 Dec 2010 14:31:13 -0800 Subject: vmstat: fix dirty threshold ordering The nr_dirty_[background_]threshold fields are misplaced before the numa_* fields, and users will read strange values. This is the right order. 
Before patch, nr_dirty_background_threshold will read as 0 (the value from numa_miss). numa_hit 128501 numa_miss 0 numa_foreign 0 numa_interleave 7388 numa_local 128501 numa_other 0 nr_dirty_threshold 144291 nr_dirty_background_threshold 72145 Signed-off-by: Wu Fengguang Cc: Michael Rubin Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 42eac4d33216..8f62f17ee1c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -750,8 +750,6 @@ static const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", - "nr_dirty_threshold", - "nr_dirty_background_threshold", #ifdef CONFIG_NUMA "numa_hit", @@ -761,6 +759,8 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", -- cgit v1.2.3 From 64141da587241301ce8638cc945f8b67853156ec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Dec 2010 14:31:18 -0800 Subject: vmalloc: eagerly clear ptes on vunmap On stock 2.6.37-rc4, running: # mount lilith:/export /mnt/lilith # find /mnt/lilith/ -type f -print0 | xargs -0 file crashes the machine fairly quickly under Xen. Often it results in oops messages, but the couple of times I tried just now, it just hung quietly and made Xen print some rude messages: (XEN) mm.c:2389:d80 Bad type (saw 7400000000000001 != exp 3000000000000000) for mfn 1d7058 (pfn 18fa7) (XEN) mm.c:964:d80 Attempt to create linear p.t. with write perms (XEN) mm.c:2389:d80 Bad type (saw 7400000000000010 != exp 1000000000000000) for mfn 1d2e04 (pfn 1d1fb) (XEN) mm.c:2965:d80 Error while pinning mfn 1d2e04 Which means the domain tried to map a pagetable page RW, which would allow it to map arbitrary memory, so Xen stopped it. 
This is because vm_unmap_ram() left some pages mapped in the vmalloc area after NFS had finished with them, and those pages got recycled as pagetable pages while still having these RW aliases. Removing those mappings immediately removes the Xen-visible aliases, and so it has no problem with those pages being reused as pagetable pages. Deferring the TLB flush doesn't upset Xen because it can flush the TLB itself as needed to maintain its invariants. When unmapping a region in the vmalloc space, clear the ptes immediately. There's no point in deferring this because there's no amortization benefit. The TLBs are left dirty, and they are flushed lazily to amortize the cost of the IPIs. This specific motivation for this patch is an oops-causing regression since 2.6.36 when using NFS under Xen, triggered by the NFS client's use of vm_map_ram() introduced in 56e4ebf877b60 ("NFS: readdir with vmapped pages") . XFS also uses vm_map_ram() and could cause similar problems. Signed-off-by: Jeremy Fitzhardinge Cc: Nick Piggin Cc: Bryan Schumaker Cc: Trond Myklebust Cc: Alex Elder Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/xen/mmu.c | 2 -- include/linux/vmalloc.h | 2 -- mm/vmalloc.c | 28 +++++++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a1feff9e59b6..44924e551fde 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2415,8 +2415,6 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; - vmap_lazy_unmap = false; - memset(dummy_mapping, 0xff, PAGE_SIZE); } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a03dcf62ca9d..44b54f619ac6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -7,8 +7,6 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ -extern bool vmap_lazy_unmap; - 
/* bits in flags of vmalloc's vm_struct below */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ #define VM_ALLOC 0x00000002 /* vmalloc() */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3d66b3dc5cb..eb5cc7d00c5a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,8 +31,6 @@ #include #include -bool vmap_lazy_unmap __read_mostly = true; - /*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) @@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void) { unsigned int log; - if (!vmap_lazy_unmap) - return 0; - log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); @@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - unmap_vmap_area(va); list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; @@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) +static void free_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -622,6 +617,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) try_purge_vmap_area_lazy(); } +/* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. 
+ */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + unmap_vmap_area(va); + free_vmap_area_noflush(va); +} + /* * Free and unmap a vmap area */ @@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb) spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area_noflush(vb->va); + free_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } @@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb); + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + spin_lock(&vb->lock); BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); @@ -988,7 +995,6 @@ void vm_unmap_aliases(void) s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); - vunmap_page_range(s, e); flush = 1; if (s < start) -- cgit v1.2.3 From 20d6c96b5f1cad5c5da4641945ec17a1d9a1afc8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Dec 2010 14:31:19 -0800 Subject: mem-hotplug: introduce {un}lock_memory_hotplug() Presently hwpoison is using lock_system_sleep() to prevent a race with memory hotplug. However lock_system_sleep() is a no-op if CONFIG_HIBERNATION=n. Therefore we need a new lock. 
Signed-off-by: KOSAKI Motohiro Cc: Andi Kleen Cc: Kamezawa Hiroyuki Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 ++++++ mm/memory-failure.c | 8 ++++---- mm/memory_hotplug.c | 31 ++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4307231bd22f..31c237a00c48 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -161,6 +161,9 @@ extern void register_page_bootmem_info_node(struct pglist_data *pgdat); extern void put_page_bootmem(struct page *page); #endif +void lock_memory_hotplug(void); +void unlock_memory_hotplug(void); + #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -192,6 +195,9 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline void lock_memory_hotplug(void) {} +static inline void unlock_memory_hotplug(void) {} + #endif /* ! CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 124324134ff6..46ab2c044b0e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -1230,11 +1231,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_system_sleep prevents a race with memory hotplug, - * because the isolation assumes there's only a single user. + * The lock_memory_hotplug prevents a race with memory hotplug. * This is a big hammer, a better would be nicer. 
*/ - lock_system_sleep(); + lock_memory_hotplug(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1264,7 +1264,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ret = 1; } unset_migratetype_isolate(p); - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9260314a221e..2c6523af5473 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,23 @@ #include "internal.h" +DEFINE_MUTEX(mem_hotplug_mutex); + +void lock_memory_hotplug(void) +{ + mutex_lock(&mem_hotplug_mutex); + + /* for exclusive hibernation if CONFIG_HIBERNATION=y */ + lock_system_sleep(); +} + +void unlock_memory_hotplug(void) +{ + unlock_system_sleep(); + mutex_unlock(&mem_hotplug_mutex); +} + + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { @@ -493,7 +510,7 @@ int mem_online_node(int nid) pg_data_t *pgdat; int ret; - lock_system_sleep(); + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (pgdat) { ret = -ENOMEM; @@ -504,7 +521,7 @@ int mem_online_node(int nid) BUG_ON(ret); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } @@ -516,7 +533,7 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - lock_system_sleep(); + lock_memory_hotplug(); res = register_memory_resource(start, size); ret = -EEXIST; @@ -563,7 +580,7 @@ error: release_memory_resource(res); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -791,7 +808,7 @@ static int offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_system_sleep(); + lock_memory_hotplug(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -880,7 +897,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_system_sleep(); + unlock_memory_hotplug(); return 0; 
failed_removal: @@ -891,7 +908,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } -- cgit v1.2.3 From a0b0f58cdd32ab363a600a294ddaa90f0c32de8c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Dec 2010 14:31:20 -0800 Subject: ksm: annotate ksm_thread_mutex is no deadlock source commit 62b61f611e ("ksm: memory hotremove migration only") caused the following new lockdep warning. ======================================================= [ INFO: possible circular locking dependency detected ] ------------------------------------------------------- bash/1621 is trying to acquire lock: ((memory_chain).rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x69/0xc0 but task is already holding lock: (ksm_thread_mutex){+.+.+.}, at: [] ksm_memory_callback+0x3a/0xc0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (ksm_thread_mutex){+.+.+.}: [] lock_acquire+0xaa/0x140 [] __mutex_lock_common+0x44/0x3f0 [] mutex_lock_nested+0x48/0x60 [] ksm_memory_callback+0x3a/0xc0 [] notifier_call_chain+0x8c/0xe0 [] __blocking_notifier_call_chain+0x7e/0xc0 [] blocking_notifier_call_chain+0x16/0x20 [] memory_notify+0x1b/0x20 [] remove_memory+0x1cc/0x5f0 [] memory_block_change_state+0xfd/0x1a0 [] store_mem_state+0xe2/0xf0 [] sysdev_store+0x20/0x30 [] sysfs_write_file+0xe6/0x170 [] vfs_write+0xc8/0x190 [] sys_write+0x54/0x90 [] system_call_fastpath+0x16/0x1b -> #0 ((memory_chain).rwsem){.+.+.+}: [] __lock_acquire+0x155a/0x1600 [] lock_acquire+0xaa/0x140 [] down_read+0x51/0xa0 [] __blocking_notifier_call_chain+0x69/0xc0 [] blocking_notifier_call_chain+0x16/0x20 [] memory_notify+0x1b/0x20 [] remove_memory+0x56e/0x5f0 [] memory_block_change_state+0xfd/0x1a0 [] store_mem_state+0xe2/0xf0 [] sysdev_store+0x20/0x30 [] sysfs_write_file+0xe6/0x170 [] vfs_write+0xc8/0x190 [] sys_write+0x54/0x90 [] system_call_fastpath+0x16/0x1b But it's a false positive. 
Both memory_chain.rwsem and ksm_thread_mutex have an outer lock (mem_hotplug_mutex). So they cannot deadlock. Thus, this patch annotates that ksm_thread_mutex is not a deadlock source. [akpm@linux-foundation.org: update comment, from Hugh] Signed-off-by: KOSAKI Motohiro Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Andi Kleen Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 65ab5c7067d9..43bc893470b4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1724,8 +1724,13 @@ static int ksm_memory_callback(struct notifier_block *self, /* * Keep it very simple for now: just lock out ksmd and * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take ksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * ksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. */ - mutex_lock(&ksm_thread_mutex); + mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); break; case MEM_OFFLINE: -- cgit v1.2.3 From 37d57443d5d810c6ef49e93586b046e7d4774818 Mon Sep 17 00:00:00 2001 From: Tero Roponen Date: Wed, 1 Dec 2010 20:04:20 +0200 Subject: slub: Fix a crash during slabinfo -v Commit f7cb1933621bce66a77f690776a16fe3ebbc4d58 ("SLUB: Pass active and inactive redzone flags instead of boolean to debug functions") missed two instances of check_object(). This caused a lot of warnings during 'slabinfo -v' finally leading to a crash: BUG ext4_xattr: Freepointer corrupt ... BUG buffer_head: Freepointer corrupt ... BUG ext4_alloc_context: Freepointer corrupt ... ... 
BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] file_sb_list_del+0x1c/0x35 PGD 79d78067 PUD 79e67067 PMD 0 Oops: 0002 [#1] SMP last sysfs file: /sys/kernel/slab/:t-0000192/validate This patch fixes the problem by converting the two missed instances. Acked-by: Christoph Lameter Signed-off-by: Tero Roponen Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 981fb730aa04..bec0e355fbad 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3401,13 +3401,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } -- cgit v1.2.3 From c9e664f1fdf34aa8cede047b206deaa8f1945af0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2010 22:57:45 +0100 Subject: PM / Hibernate: Fix memory corruption related to swap There is a problem that swap pages allocated before the creation of a hibernation image can be released and used for storing the contents of different memory pages while the image is being saved. Since the kernel stored in the image doesn't know of that, it causes memory corruption to occur after resume from hibernation, especially on systems with relatively small RAM that need to swap often. This issue can be addressed by keeping the GFP_IOFS bits clear in gfp_allowed_mask during the entire hibernation, including the saving of the image, until the system is finally turned off or the hibernation is aborted. Unfortunately, for this purpose it's necessary to rework the way in which the hibernate and suspend code manipulates gfp_allowed_mask. 
This change is based on an earlier patch from Hugh Dickins. Signed-off-by: Rafael J. Wysocki Reported-by: Ondrej Zary Acked-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: stable@kernel.org --- include/linux/gfp.h | 4 ++-- kernel/power/hibernate.c | 22 ++++++++++++---------- kernel/power/suspend.c | 5 ++--- kernel/power/user.c | 2 ++ mm/page_alloc.c | 19 ++++++++++++------- 5 files changed, 30 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e8713d55360a..f54adfcbec9c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -360,7 +360,7 @@ void drain_local_pages(void *dummy); extern gfp_t gfp_allowed_mask; -extern void set_gfp_allowed_mask(gfp_t mask); -extern gfp_t clear_gfp_allowed_mask(gfp_t mask); +extern void pm_restrict_gfp_mask(void); +extern void pm_restore_gfp_mask(void); #endif /* __LINUX_GFP_H */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0a..048d0b514831 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -327,7 +327,6 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; - gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode) goto Recover_platform; error = create_image(platform_mode); - /* Control returns here after successful restore */ + /* + * Control returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ Resume_devices: /* We may need to release the preallocated image pages here. */ @@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + resume_console(); Close: platform_end(platform_mode); @@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; - gfp_t saved_mask; pm_prepare_console(); suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); pm_restore_console(); return error; @@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; - gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -492,7 +495,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -536,7 +538,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); resume_console(); Close: @@ -646,6 +647,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee473..ecf770509d0d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; - gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + 
pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877ca..1b2ea31e6bd8 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; + pm_restore_gfp_mask(); thaw_processes(); usermodehelper_enable(); data->frozen = 0; @@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -EPERM; break; } + pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) error = put_user(in_suspend, (int __user *)arg); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4092704c1a9..ff7e15872398 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; * only be modified with pm_mutex held, unless the suspend/hibernate code is * guaranteed not to run in parallel with that modification). */ -void set_gfp_allowed_mask(gfp_t mask) + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask = mask; + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } } -gfp_t clear_gfp_allowed_mask(gfp_t mask) +void pm_restrict_gfp_mask(void) { - gfp_t ret = gfp_allowed_mask; - WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask &= ~mask; - return ret; + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; } #endif /* CONFIG_PM_SLEEP */ -- cgit v1.2.3