From 623e3db9f9b7d6e7b2a99180f9cf0825c936ab7a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 28 Mar 2012 14:42:40 -0700 Subject: mm for fs: add truncate_pagecache_range() Holepunching filesystems ext4 and xfs are using truncate_inode_pages_range but forgetting to unmap pages first (ocfs2 remembers). This is not really a bug, since races already require truncate_inode_page() to handle that case once the page is locked; but it can be very inefficient if the file being punched happens to be mapped into many vmas. Provide a drop-in replacement truncate_pagecache_range() which does the unmapping pass first, handling the awkward mismatch between arguments to truncate_inode_pages_range() and arguments to unmap_mapping_range(). Note that holepunching does not unmap privately COWed pages in the range: POSIX requires that we do so when truncating, but it's hard to justify, difficult to implement without an i_size cutoff, and no filesystem is attempting to implement it. Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Cc: Dave Chinner Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 18aded3a89fc..61a183b89df6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) return 0; } + +/** + * truncate_pagecache_range - unmap and remove pagecache that is hole-punched + * @inode: inode + * @lstart: offset of beginning of hole + * @lend: offset of last byte of hole + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + /* + * This rounding is currently just for example: unmap_mapping_range + * expands its hole outwards, whereas we want it to contract the hole + * inwards. However, existing callers of truncate_pagecache_range are + * doing their own page rounding first; and truncate_inode_pages_range + * currently BUGs if lend is not pagealigned-1 (it handles partial + * page at start of hole, but not partial page at end of hole). Note + * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. + */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); -- cgit v1.2.3 From 45f83cefe3a5f0476ac3f96382ebfdc3fe4caab2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 28 Mar 2012 14:42:40 -0700 Subject: mm: thp: fix up pmd_trans_unstable() locations pmd_trans_unstable() should be called before pmd_offset_map() in the locations where the mmap_sem is held for reading. Signed-off-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: Larry Woodman Cc: Ulrich Obergfell Cc: Rik van Riel Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 ++--- mm/memcontrol.c | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9694cc283511..c283832d411d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -781,9 +781,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); - if (pmd_trans_unstable(pmd)) - return 0; - /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); spin_lock(&walk->mm->page_table_lock); @@ -802,6 +799,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } + if (pmd_trans_unstable(pmd)) + return 0; for (; addr != end; addr += PAGE_SIZE) { /* check to see if we've left 'vma' behind diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2ee6df0e9bb..7d698df4a067 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (get_mctgt_type(vma, addr, *pte, NULL)) @@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { -- cgit v1.2.3 From 29fd66d289f2981e11c550f8b411a6d3d38be0cf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 28 Mar 2012 14:42:41 -0700 Subject: mm, coredump: fail allocations when coredumping instead of oom killing The size of coredump files is limited by RLIMIT_CORE, however, allocating large amounts of memory results in three negative consequences: - the coredumping process may be chosen for oom kill and quickly deplete all memory reserves in oom conditions preventing further progress from being made or tasks from exiting, - the coredumping process may cause other processes to be oom killed without fault of their own as the result of a SIGSEGV, for example, in the coredumping process, or - the coredumping process may result in a livelock while writing to the dump file if it needs memory to allocate while other threads are in the exit path waiting on the coredumper to complete. This is fixed by implying __GFP_NORETRY in the page allocator for coredumping processes when reclaim has failed so the allocations fail and the process continues to exit. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caea788628e4..c313afcc8e5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2308,6 +2308,10 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; + /* Coredumps can quickly deplete all memory reserves */ + if ((current->flags & PF_DUMPCORE) && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v1.2.3 From d15cab975459fb6092eeba1be72c13621337784f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 28 Mar 2012 14:42:42 -0700 Subject: swapon: check validity of swap_flags Most system calls taking flags first check that the flags passed in are valid, and that helps userspace to detect when new flags are supported. But swapon never did so: start checking now, to help if we ever want to support more swap_flags in future. It's difficult to get stray bits set in an int, and swapon is not widely used, so this is most unlikely to break any userspace; but we can just revert if it turns out to do so. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 +++ mm/swapfile.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index b86b5c20617d..8dc0ea7caf02 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -21,6 +21,9 @@ struct bio; #define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ +#define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ + SWAP_FLAG_DISCARD) + static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; diff --git a/mm/swapfile.c b/mm/swapfile.c index dae42f380d6e..fafc26d1b1dc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct page *page = NULL; struct inode *inode = NULL; + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3 From a8364d5555b2030d093cde0f07951628e55454e1 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:44 -0700 Subject: slub: only IPI CPUs that have per cpu obj to flush flush_all() is called for each kmem_cache_destroy(). So every cache being destroyed dynamically ends up sending an IPI to each CPU in the system, regardless if the cache has ever been used there. For example, if you close the Infinband ipath driver char device file, the close file ops calls kmem_cache_destroy(). So running some infiniband config tool on one a single CPU dedicated to system tasks might interrupt the rest of the 127 CPUs dedicated to some CPU intensive or latency sensitive task. I suspect there is a good chance that every line in the output of "git grep kmem_cache_destroy linux/ | grep '\->'" has a similar scenario. This patch attempts to rectify this issue by sending an IPI to flush the per cpu objects back to the free lists only to CPUs that seem to have such objects. The check which CPU to IPI is racy but we don't care since asking a CPU without per cpu objects to flush does no damage and as far as I can tell the flush_all by itself is racy against allocs on remote CPUs anyway, so if you required the flush_all to be determinstic, you had to arrange for locking regardless. Without this patch the following artificial test case: $ cd /sys/kernel/slab $ for DIR in *; do cat $DIR/alloc_calls > /dev/null; done produces 166 IPIs on an cpuset isolated CPU. With it it produces none. The code path of memory allocation failure for CPUMASK_OFFSTACK=y config was tested using fault injection framework. Signed-off-by: Gilad Ben-Yossef Acked-by: Christoph Lameter Cc: Chris Metcalf Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Sasha Levin Cc: Rik van Riel Cc: Andi Kleen Cc: Mel Gorman Cc: Alexander Viro Cc: Avi Kivity Cc: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f4a6229848fd..dcbb1926cb7f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2028,9 +2028,17 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return !!(c->page); +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* -- cgit v1.2.3 From 74046494ea68676d29ef6501a4bd950f08112a2c Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:45 -0700 Subject: mm: only IPI CPUs to drain local pages if they exist Calculate a cpumask of CPUs with per-cpu pages in any zone and only send an IPI requesting CPUs to drain these pages to the buddy allocator if they actually have pages when asked to flush. This patch saves 85%+ of IPIs asking to drain per-cpu pages in case of severe memory pressure that leads to OOM since in these cases multiple, possibly concurrent, allocation requests end up in the direct reclaim code path so when the per-cpu pages end up reclaimed on first allocation failure for most of the proceeding allocation attempts until the memory pressure is off (possibly via the OOM killer) there are no per-cpu pages on most CPUs (and there can easily be hundreds of them). This also has the side effect of shortening the average latency of direct reclaim by 1 or more order of magnitude since waiting for all the CPUs to ACK the IPI takes a long time. Tested by running "hackbench 400" on a 8 CPU x86 VM and observing the difference between the number of direct reclaim attempts that end up in drain_all_pages() and those were more then 1/2 of the online CPU had any per-cpu page in them, using the vmstat counters introduced in the next patch in the series and using proc/interrupts. In the test sceanrio, this was seen to save around 3600 global IPIs after trigerring an OOM on a concurrent workload: $ cat /proc/vmstat | tail -n 2 pcp_global_drain 0 pcp_global_ipi_saved 0 $ cat /proc/interrupts | grep CAL CAL: 1 2 1 2 2 2 2 2 Function call interrupts $ hackbench 400 [OOM messages snipped] $ cat /proc/vmstat | tail -n 2 pcp_global_drain 3647 pcp_global_ipi_saved 3642 $ cat /proc/interrupts | grep CAL CAL: 6 13 6 3 3 3 1 2 7 Function call interrupts Please note that if the global drain is removed from the direct reclaim path as a patch from Mel Gorman currently suggests this should be replaced with an on_each_cpu_cond invocation. Signed-off-by: Gilad Ben-Yossef Acked-by: Mel Gorman Cc: KOSAKI Motohiro Acked-by: Christoph Lameter Acked-by: Peter Zijlstra Cc: Pekka Enberg Cc: Rik van Riel Cc: Andi Kleen Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c313afcc8e5a..a712fb9e04ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 1); + int cpu; + struct per_cpu_pageset *pcp; + struct zone *zone; + + /* + * Allocate in the BSS so we wont require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + bool has_pcps = false; + for_each_populated_zone(zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION -- cgit v1.2.3 From 0fc9d1040313047edf6a39fd4d7c7defdca97c62 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 28 Mar 2012 14:42:54 -0700 Subject: radix-tree: use iterators in find_get_pages* functions Replace radix_tree_gang_lookup_slot() and radix_tree_gang_lookup_tag_slot() in page-cache lookup functions with brand-new radix-tree direct iterating. This avoids the double-scanning and pointer copying. Iterator don't stop after nr_pages page-get fails in a row, it continue lookup till the radix-tree end. Thus we can safely remove these restart conditions. Unfortunately, old implementation didn't forbid nr_pages == 0, this corner case does not fit into new code, so the patch adds an extra check at the beginning. Signed-off-by: Konstantin Khlebnikov Tested-by: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 86 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c3811bc6b9e3..79c4b2b0b14e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found, nr_skip; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, start, nr_pages); - ret = 0; - nr_skip = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -837,7 +836,7 @@ repeat: * when entry at index 0 moves out of or back * to root: none yet gotten, safe to restart. */ - WARN_ON(start | i); + WARN_ON(iter.index); goto restart; } /* @@ -845,7 +844,6 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ - nr_skip++; continue; } @@ -853,21 +851,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found > nr_skip)) - goto restart; rcu_read_unlock(); return ret; } @@ -887,21 +880,22 @@ repeat: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, index, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ if (unlikely(!page)) - continue; + break; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -924,7 +918,7 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } @@ -934,14 +928,14 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page->index != index) { + if (page->mapping == NULL || page->index != iter.index) { page_cache_release(page); break; } pages[ret] = page; - ret++; - index++; + if (++ret == nr_pages) + break; } rcu_read_unlock(); return ret; @@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, - (void ***)pages, *index, nr_pages, tag); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -998,21 +993,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); if (ret) -- cgit v1.2.3