From bd50cfa89153a67429935a15e577a5eb5f10dd1b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Jun 2011 07:18:45 -0400 Subject: slob/lockdep: Fix gfp flags passed to lockdep Doing a ktest.pl randconfig, I stumbled across the following bug on boot up: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/autotest/nobackup/linux-test.git/kernel/lockdep.c:2649 lockdep_trace_alloc+0xed/0x100() Hardware name: Modules linked in: Pid: 0, comm: swapper Not tainted 3.0.0-rc1-test-00054-g1d68b67 #1 Call Trace: [] warn_slowpath_common+0xad/0xf0 [] warn_slowpath_null+0x1a/0x20 [] lockdep_trace_alloc+0xed/0x100 [] __kmalloc_node+0x30/0x2f0 [] pcpu_mem_alloc+0x13a/0x180 [] percpu_init_late+0x48/0xc2 [] ? mem_init+0xd8/0xe3 [] start_kernel+0x1c2/0x449 [] x86_64_start_reservations+0x163/0x167 [] x86_64_start_kernel+0x133/0x142^M ---[ end trace a7919e7f17c0a725 ]--- Then I ran a ktest.pl config_bisect and it came up with this config as the problem: CONFIG_SLOB Looking at what is different between SLOB and SLAB and SLUB, I found that the gfp flags are masked against gfp_allowed_mask in SLAB and SLUB, but not SLOB. On boot up, interrupts are disabled and lockdep will warn if some flags are set in gfp and interrupts are disabled. But these flags are masked off with the gfp_allowed_mask during boot. Because SLOB does not mask the flags against gfp_allowed_mask it triggers the warn on. Adding this mask fixes the bug. I also found that kmem_cache_alloc_node() was missing both the mask and the lockdep check, and that was added too. Acked-by: Matt Mackall Cc: Paul Mundt Cc: Nick Piggin Signed-off-by: Steven Rostedt Signed-off-by: Pekka Enberg --- mm/slob.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 46e0aee33a23..0ae881831ae2 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; + gfp &= gfp_allowed_mask; + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { @@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, -- cgit v1.2.3 From 6e6938b6d3130305a5960c86b1a9b21e58cf6144 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 6 Jun 2010 10:38:15 -0600 Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync stage sync(2) is performed in two stages: the WB_SYNC_NONE sync and the WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and do livelock prevention for it, too. Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance using page tagging") is a partial fix in that it only fixed the WB_SYNC_ALL phase livelock. Although ext4 is tested to no longer livelock with commit f446daaea9, it may due to some "redirty_tail() after pages_skipped" effect which is by no means a guarantee for _all_ the file systems. Note that writeback_inodes_sb() is called by not only sync(), they are treated the same because the other callers also need livelock prevention. Impact: It changes the order in which pages/inodes are synced to disk. Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode until finished with the current inode. 
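For illustration, a minimal sketch of the tag-and-write pattern that .tagged_writepages enables for the WB_SYNC_NONE stage; the helper name is hypothetical, while the fields and calls mirror the hunks below:

/*
 * Snapshot the pages that are dirty *now* by tagging them, then write back
 * only tagged pages.  Pages dirtied after the tagging pass carry only
 * PAGECACHE_TAG_DIRTY, so a concurrent dirtier cannot livelock this pass.
 */
static int pick_writeback_tag(struct address_space *mapping,
			      struct writeback_control *wbc,
			      pgoff_t index, pgoff_t end)
{
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
		tag_pages_for_writeback(mapping, index, end);
		return PAGECACHE_TAG_TOWRITE;
	}
	return PAGECACHE_TAG_DIRTY;
}

The caller then walks only pages carrying the returned tag (e.g. via pagevec_lookup_tag()), which is what write_cache_pages() and ext4's write_cache_pages_da() do in the hunks below.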
Acked-by: Jan Kara CC: Dave Chinner Signed-off-by: Wu Fengguang --- fs/ext4/inode.c | 4 ++-- fs/fs-writeback.c | 17 +++++++++-------- include/linux/writeback.h | 1 + mm/page-writeback.c | 4 ++-- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a5763e3505ba..8558b6c3450a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); while (!ret && wbc->nr_to_write > 0) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f015a0468de..5ed2ce9a28d0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,6 +36,7 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -650,6 +651,7 @@ static long wb_writeback(struct bdi_writeback *wb, { struct writeback_control wbc = { .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, .older_than_this = NULL, .for_kupdate = work->for_kupdate, .for_background = work->for_background, @@ -657,7 +659,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; - long write_chunk; + long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; if (wbc.for_kupdate) { @@ -683,9 +685,7 @@ static long wb_writeback(struct bdi_writeback *wb, * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ @@ -1188,10 +1188,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 17e7ccc322a5..3f6542ca6198 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -47,6 +47,7 @@ struct writeback_control { unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ + unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f698862420..955fe35d01e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -892,12 +892,12 @@ int write_cache_pages(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests 
*/ } - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { -- cgit v1.2.3 From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- fs/block_dev.c | 16 +++++--- fs/fs-writeback.c | 97 +++++++++++++++++++++++---------------------- fs/inode.c | 5 +-- include/linux/backing-dev.h | 2 + include/linux/writeback.h | 2 - mm/backing-dev.c | 21 ++++++++-- mm/filemap.c | 6 +-- mm/rmap.c | 4 +- 8 files changed, 85 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/fs/block_dev.c 
b/fs/block_dev.c index 1a2421f908f0..3c9a03e51b62 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 664acdb2e7ef..36a30917e0dc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); } - /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -214,11 +213,9 @@ static void redirty_tail(struct inode *inode) /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } @@ -226,7 +223,7 @@ static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through - * spin_unlock(&inode_wb_list_lock); + * spin_unlock(&wb->list_lock); */ smp_mb(); @@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -317,7 +314,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) /* * Wait for writeback on an inode to complete. 
*/ -static void inode_wait_for_writeback(struct inode *inode) +static void inode_wait_for_writeback(struct inode *inode, + struct bdi_writeback *wb) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -325,15 +323,15 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * Write out an inode's dirty pages. Called under wb->list_lock and * inode->i_lock. Either the caller has an active reference on the inode or * the inode has I_WILL_FREE set. * @@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode) * livelocks, etc. */ static int -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; unsigned dirty; int ret; - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - requeue_io(inode); + requeue_io(inode, wb); return 0; } /* * It's a data-integrity sync. We must wait. */ - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); } BUG_ON(inode->i_state & I_SYNC); @@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); ret = do_writepages(mapping, wbc); @@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * slice used up: queue for next turn */ - requeue_io(inode); + requeue_io(inode, wb); } else { /* * Writeback blocked by something other than @@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * submission or metadata updates after data IO * completion. */ - redirty_tail(inode); + redirty_tail(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * superblock, move all inodes not belonging * to it back onto the dirty list. 
*/ - redirty_tail(inode); + redirty_tail(inode, wb); continue; } @@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode); + requeue_io(inode, wb); continue; } @@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, __iget(inode); pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wbc); + writeback_single_inode(inode, wb, wbc); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode); + redirty_tail(inode, wb); } spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); iput(inode); cond_resched(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, struct super_block *sb = inode->i_sb; if (!pin_sb_for_writeback(sb)) { - requeue_io(inode); + requeue_io(inode, wb); continue; } ret = writeback_sb_inodes(sb, wb, wbc, false); @@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); /* Leave any unwritten inodes on b_io */ } @@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); } /* @@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. 
*/ - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); } return wrote; @@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } spin_unlock(&inode->i_lock); - spin_lock(&inode_wb_list_lock); + spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); @@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, &wbc); + ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wbc); + ret = writeback_single_inode(inode, wb, wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 0f7e88a7803f..4be128cbc754 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -37,7 +37,7 @@ * inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * inode_wb_list_lock protects: + * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash @@ -48,7 +48,7 @@ * inode->i_lock * inode_lru_lock * - * inode_wb_list_lock + * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock @@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru); static DEFINE_SPINLOCK(inode_lru_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 96f4094b706d..47feb2c4706a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -57,6 +57,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + spinlock_t list_lock; /* protects the b_* lists */ }; struct backing_dev_info { @@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head 
bdi_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 7df9026f7129..c2d957fb38d3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -9,8 +9,6 @@ struct backing_dev_info; -extern spinlock_t inode_wb_list_lock; - /* * fs/fs-writeback.c */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09a..5f6553ef1ba7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +{ + if (wb1 < wb2) { + spin_lock(&wb1->list_lock); + spin_lock_nested(&wb2->list_lock, 1); + } else { + spin_lock(&wb2->list_lock); + spin_lock_nested(&wb1->list_lock, 1); + } +} + #ifdef CONFIG_DEBUG_FS #include #include @@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + spin_lock_init(&wb->list_lock); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } @@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_wb_list_lock); + bdi_lock_two(&bdi->wb, dst); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); + spin_unlock(&dst->list_lock); } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index d7b10578a64b..1e492c3dd6f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -81,7 +81,7 @@ * ->i_mutex * ->i_alloc_sem (various) * - * inode_wb_list_lock + * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * @@ -99,9 +99,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (zap_pte_range->set_page_dirty) + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * diff --git a/mm/rmap.c b/mm/rmap.c index 0eb463ea88dd..d04e36a7cc9f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock 
(widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_wb_list_lock in __sync_single_inode) + * within bdi.wb->list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock -- cgit v1.2.3 From 6f7186562771ec9b629914df328048449ccddf4a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Mar 2011 17:14:34 -0600 Subject: writeback: add bdi_dirty_limit() kernel-doc Clarify the bdi_dirty_limit() comment. Acked-by: Peter Zijlstra Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 955fe35d01e0..b8be62381396 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -437,10 +437,17 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) *pdirty = dirty; } -/* +/** * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * And the "limit" in the name is not seriously taken as hard limit in + * balance_dirty_pages(). * - * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * -- cgit v1.2.3 From 3efaf0faba6793cd91298c76315e15de59c13ae0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 16 Dec 2010 22:22:00 -0600 Subject: writeback: skip balance_dirty_pages() for in-memory fs This avoids unnecessary checks and dirty throttling on tmpfs/ramfs. Notes about the tmpfs/ramfs behavior changes: As for 2.6.36 and older kernels, the tmpfs writes will sleep inside balance_dirty_pages() as long as we are over the (dirty+background)/2 global throttle threshold. This is because both the dirty pages and threshold will be 0 for tmpfs/ramfs. Hence this test will always evaluate to TRUE: dirty_exceeded = (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) || (nr_reclaimable + nr_writeback >= dirty_thresh); For 2.6.37, someone complained that the current logic does not allow the users to set vm.dirty_ratio=0. So commit 4cbec4c8b9 changed the test to dirty_exceeded = (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) || (nr_reclaimable + nr_writeback > dirty_thresh); So 2.6.37 will behave differently for tmpfs/ramfs: it will never get throttled unless the global dirty threshold is exceeded (which is very unlikely to happen; once happen, will block many tasks). I'd say that the 2.6.36 behavior is very bad for tmpfs/ramfs. It means for a busy writing server, tmpfs write()s may get livelocked! The "inadvertent" throttling can hardly bring help to any workload because of its "either no throttling, or get throttled to death" property. So based on 2.6.37, this patch won't bring more noticeable changes. 
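As a hedged sketch of the intended behaviour (the real change is the small hunk in the diff below; the _sketch name is made up), the in-memory case is detected via the bdi capability bits and throttling is skipped outright:

static void balance_dirty_pages_sketch(struct address_space *mapping)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;

	/* tmpfs/ramfs set BDI_CAP_NO_ACCT_DIRTY: no dirty accounting,
	 * hence no meaningful thresholds to throttle against. */
	if (!bdi_cap_account_dirty(bdi))
		return;

	/* ... the usual global and per-bdi threshold checks follow ... */
}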
CC: Hugh Dickins Acked-by: Rik van Riel Acked-by: Peter Zijlstra Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b8be62381396..b2529f8f8be0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -244,13 +244,8 @@ void task_dirty_inc(struct task_struct *tsk) static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + prop_fraction_percpu(&vm_completions, &bdi->completions, numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } } static inline void task_dirties_fraction(struct task_struct *tsk, @@ -495,6 +490,9 @@ static void balance_dirty_pages(struct address_space *mapping, bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; + if (!bdi_cap_account_dirty(bdi)) + return; + for (;;) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, -- cgit v1.2.3 From 140a1ef2f91a00e1d25f0878c193abdc25bf6ebe Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Fri, 10 Jun 2011 03:57:26 +0000 Subject: mm Kconfig typo: cleancacne -> cleancache Signed-off-by: Michael Witten Signed-off-by: Jiri Kosina --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 8ca47a5ee9c8..f2f1ca19ed53 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -356,7 +356,7 @@ config CLEANCACHE for clean pages that the kernel's pageframe replacement algorithm (PFRA) would like to keep around, but can't since there isn't enough memory. So when the PFRA "evicts" a page, it first attempts to use - cleancacne code to put the data contained in that page into + cleancache code to put the data contained in that page into "transcendent memory", memory that is not directly accessible or addressable by the kernel and is of unknown and possibly time-varying size. And when a cleancache-enabled -- cgit v1.2.3 From 36715cef0770b7e2547892b7c3197fc024274630 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 17:53:57 -0600 Subject: writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr() This helps prevent tmpfs dirtiers from skewing the per-cpu bdp_ratelimits. 
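A rough sketch of the ratelimited path this protects, simplified from mm/page-writeback.c of this era (the function name is invented; treat the body as an approximation rather than the exact upstream code):

static void ratelimited_dirty_sketch(struct address_space *mapping,
				     unsigned long nr_pages_dirtied)
{
	unsigned long *p;

	/* Bail out before touching the per-cpu counter, so tmpfs writes
	 * no longer inflate bdp_ratelimits. */
	if (!bdi_cap_account_dirty(mapping->backing_dev_info))
		return;

	preempt_disable();
	p = &__get_cpu_var(bdp_ratelimits);
	*p += nr_pages_dirtied;
	if (*p >= ratelimit_pages) {
		*p = 0;
		preempt_enable();
		balance_dirty_pages(mapping, ratelimit_pages);
		return;
	}
	preempt_enable();
}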
Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b2529f8f8be0..1965d05a29cc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -490,9 +490,6 @@ static void balance_dirty_pages(struct address_space *mapping, bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; - if (!bdi_cap_account_dirty(bdi)) - return; - for (;;) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, @@ -631,9 +628,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { + struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long ratelimit; unsigned long *p; + if (!bdi_cap_account_dirty(bdi)) + return; + ratelimit = ratelimit_pages; if (mapping->backing_dev_info->dirty_exceeded) ratelimit = 8; -- cgit v1.2.3 From d21142ece414ce1088cfcae760689aa60d6fee80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jun 2011 16:50:34 +0200 Subject: ptrace: kill task_ptrace() task_ptrace(task) simply dereferences task->ptrace and isn't even used consistently only adding confusion. Kill it and directly access ->ptrace instead. This doesn't introduce any behavior change. Signed-off-by: Tejun Heo Signed-off-by: Oleg Nesterov --- include/linux/ptrace.h | 11 ----------- include/linux/tracehook.h | 16 ++++++++-------- kernel/exit.c | 8 ++++---- kernel/signal.c | 14 +++++++------- mm/oom_kill.c | 3 +-- 5 files changed, 20 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 4f224f169524..3ff20b322598 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -145,17 +145,6 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); -/** - * task_ptrace - return %PT_* flags that apply to a task - * @task: pointer to &task_struct in question - * - * Returns the %PT_* flags that apply to @task. 
- */ -static inline int task_ptrace(struct task_struct *task) -{ - return task->ptrace; -} - /** * ptrace_event - possibly stop for a ptrace event notification * @mask: %PT_* bit to check in @current->ptrace diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 15745cdd32ce..a3e838784f43 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -63,7 +63,7 @@ struct linux_binprm; */ static inline int tracehook_expect_breakpoints(struct task_struct *task) { - return (task_ptrace(task) & PT_PTRACED) != 0; + return (task->ptrace & PT_PTRACED) != 0; } /* @@ -71,7 +71,7 @@ static inline int tracehook_expect_breakpoints(struct task_struct *task) */ static inline void ptrace_report_syscall(struct pt_regs *regs) { - int ptrace = task_ptrace(current); + int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return; @@ -155,7 +155,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) static inline int tracehook_unsafe_exec(struct task_struct *task) { int unsafe = 0; - int ptrace = task_ptrace(task); + int ptrace = task->ptrace; if (ptrace & PT_PTRACED) { if (ptrace & PT_PTRACE_CAP) unsafe |= LSM_UNSAFE_PTRACE_CAP; @@ -178,7 +178,7 @@ static inline int tracehook_unsafe_exec(struct task_struct *task) */ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk) { - if (task_ptrace(tsk) & PT_PTRACED) + if (tsk->ptrace & PT_PTRACED) return rcu_dereference(tsk->parent); return NULL; } @@ -202,7 +202,7 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, struct pt_regs *regs) { if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && - unlikely(task_ptrace(current) & PT_PTRACED)) + unlikely(current->ptrace & PT_PTRACED)) send_sig(SIGTRAP, current, 0); } @@ -285,7 +285,7 @@ static inline void tracehook_report_clone(struct pt_regs *regs, unsigned long clone_flags, pid_t pid, struct task_struct *child) { - if (unlikely(task_ptrace(child))) { + if (unlikely(child->ptrace)) { /* * It doesn't matter who attached/attaching to this * task, the pending SIGSTOP is right in any case. @@ -403,7 +403,7 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, static inline int tracehook_consider_ignored_signal(struct task_struct *task, int sig) { - return (task_ptrace(task) & PT_PTRACED) != 0; + return (task->ptrace & PT_PTRACED) != 0; } /** @@ -422,7 +422,7 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, static inline int tracehook_consider_fatal_signal(struct task_struct *task, int sig) { - return (task_ptrace(task) & PT_PTRACED) != 0; + return (task->ptrace & PT_PTRACED) != 0; } #define DEATH_REAP -1 diff --git a/kernel/exit.c b/kernel/exit.c index 289f59d686bf..e5cc05644609 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -765,7 +765,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. 
*/ - if (!task_ptrace(p) && + if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { do_notify_parent(p, p->exit_signal); if (task_detached(p)) { @@ -795,7 +795,7 @@ static void forget_original_parent(struct task_struct *father) do { t->real_parent = reaper; if (t->parent == father) { - BUG_ON(task_ptrace(t)); + BUG_ON(t->ptrace); t->parent = t->real_parent; } if (t->pdeath_signal) @@ -1565,7 +1565,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Notification and reaping will be cascaded to the real * parent when the ptracer detaches. */ - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* it will become visible, clear notask_error */ wo->notask_error = 0; return 0; @@ -1608,7 +1608,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * own children, it should create a separate process which * takes the role of real parent. */ - if (likely(!ptrace) && task_ptrace(p) && + if (likely(!ptrace) && p->ptrace && same_thread_group(p->parent, p->real_parent)) return 0; diff --git a/kernel/signal.c b/kernel/signal.c index 97e575a3387e..0f3370872506 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1592,7 +1592,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) /* do_notify_parent_cldstop should have been called instead. */ BUG_ON(task_is_stopped_or_traced(tsk)); - BUG_ON(!task_ptrace(tsk) && + BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); info.si_signo = sig; @@ -1631,7 +1631,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); - if (!task_ptrace(tsk) && sig == SIGCHLD && + if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* @@ -1731,7 +1731,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, static inline int may_ptrace_stop(void) { - if (!likely(task_ptrace(current))) + if (!likely(current->ptrace)) return 0; /* * Are we in the middle of do_coredump? @@ -1989,7 +1989,7 @@ static bool do_signal_stop(int signr) if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; else - WARN_ON_ONCE(!task_ptrace(current)); + WARN_ON_ONCE(!current->ptrace); sig->group_stop_count = 0; @@ -2014,7 +2014,7 @@ static bool do_signal_stop(int signr) } } - if (likely(!task_ptrace(current))) { + if (likely(!current->ptrace)) { int notify = 0; /* @@ -2093,7 +2093,7 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info, struct pt_regs *regs, void *cookie) { - if (!task_ptrace(current)) + if (!current->ptrace) return signr; ptrace_signal_deliver(regs, cookie); @@ -2179,7 +2179,7 @@ relock: do_notify_parent_cldstop(current, false, why); leader = current->group_leader; - if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) + if (leader->ptrace && !real_parent_is_ptracer(leader)) do_notify_parent_cldstop(leader, true, why); read_unlock(&tasklist_lock); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca351..b0be989d4365 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * then wait for it to finish before killing * some other task unnecessarily. 
*/ - if (!(task_ptrace(p->group_leader) & - PT_TRACE_EXIT)) + if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) return ERR_PTR(-1UL); } } -- cgit v1.2.3 From a288eecce5253cc1565d400a52b9b476a157e040 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Jun 2011 16:50:37 +0200 Subject: ptrace: kill trivial tracehooks At this point, tracehooks aren't useful to mainline kernel and mostly just add an extra layer of obfuscation. Although they have comments, without actual in-kernel users, it is difficult to tell what are their assumptions and they're actually trying to achieve. To mainline kernel, they just aren't worth keeping around. This patch kills the following trivial tracehooks. * Ones testing whether task is ptraced. Replace with ->ptrace test. tracehook_expect_breakpoints() tracehook_consider_ignored_signal() tracehook_consider_fatal_signal() * ptrace_event() wrappers. Call directly. tracehook_report_exec() tracehook_report_exit() tracehook_report_vfork_done() * ptrace_release_task() wrapper. Call directly. tracehook_finish_release_task() * noop tracehook_prepare_release_task() tracehook_report_death() This doesn't introduce any behavior change. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Cc: Martin Schwidefsky Signed-off-by: Oleg Nesterov --- arch/s390/kernel/traps.c | 4 +- fs/exec.c | 2 +- include/linux/tracehook.h | 156 ---------------------------------------------- kernel/exit.c | 7 +-- kernel/fork.c | 2 +- kernel/signal.c | 8 +-- mm/nommu.c | 3 +- 7 files changed, 11 insertions(+), 171 deletions(-) (limited to 'mm') diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index a65d2e82f61d..a63d34c3611e 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -331,7 +331,7 @@ void __kprobes do_per_trap(struct pt_regs *regs) { if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP) return; - if (tracehook_consider_fatal_signal(current, SIGTRAP)) + if (current->ptrace) force_sig(SIGTRAP, current); } @@ -425,7 +425,7 @@ static void __kprobes illegal_op(struct pt_regs *regs, long pgm_int_code, if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) return; if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { - if (tracehook_consider_fatal_signal(current, SIGTRAP)) + if (current->ptrace) force_sig(SIGTRAP, current); else signal = SIGILL; diff --git a/fs/exec.c b/fs/exec.c index a9f2b3631bdb..b37030d0a50b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1384,7 +1384,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) - tracehook_report_exec(fmt, bprm, regs); + ptrace_event(PTRACE_EVENT_EXEC, 0); put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3b68aa842a92..8b06d4f2b814 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -51,21 +51,6 @@ #include struct linux_binprm; -/** - * tracehook_expect_breakpoints - guess if task memory might be touched - * @task: current task, making a new mapping - * - * Return nonzero if @task is expected to want breakpoint insertion in - * its memory at some point. A zero return is no guarantee it won't - * be done, but this is a hint that it's known to be likely. - * - * May be called with @task->mm->mmap_sem held for writing. - */ -static inline int tracehook_expect_breakpoints(struct task_struct *task) -{ - return (task->ptrace & PT_PTRACED) != 0; -} - /* * ptrace report for syscall entry and exit looks identical. 
*/ @@ -183,42 +168,6 @@ static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk) return NULL; } -/** - * tracehook_report_exec - a successful exec was completed - * @fmt: &struct linux_binfmt that performed the exec - * @bprm: &struct linux_binprm containing exec details - * @regs: user-mode register state - * - * An exec just completed, we are shortly going to return to user mode. - * The freshly initialized register state can be seen and changed in @regs. - * The name, file and other pointers in @bprm are still on hand to be - * inspected, but will be freed as soon as this returns. - * - * Called with no locks, but with some kernel resources held live - * and a reference on @fmt->module. - */ -static inline void tracehook_report_exec(struct linux_binfmt *fmt, - struct linux_binprm *bprm, - struct pt_regs *regs) -{ - ptrace_event(PTRACE_EVENT_EXEC, 0); -} - -/** - * tracehook_report_exit - task has begun to exit - * @exit_code: pointer to value destined for @current->exit_code - * - * @exit_code points to the value passed to do_exit(), which tracing - * might change here. This is almost the first thing in do_exit(), - * before freeing any resources or setting the %PF_EXITING flag. - * - * Called with no locks held. - */ -static inline void tracehook_report_exit(long *exit_code) -{ - ptrace_event(PTRACE_EVENT_EXIT, *exit_code); -} - /** * tracehook_prepare_clone - prepare for new child to be cloned * @clone_flags: %CLONE_* flags from clone/fork/vfork system call @@ -319,52 +268,6 @@ static inline void tracehook_report_clone_complete(int trace, ptrace_event(trace, pid); } -/** - * tracehook_report_vfork_done - vfork parent's child has exited or exec'd - * @child: child task, already running - * @pid: new child's PID in the parent's namespace - * - * Called after a %CLONE_VFORK parent has waited for the child to complete. - * The clone/vfork system call will return immediately after this. - * The @child pointer may be invalid if a self-reaping child died and - * tracehook_report_clone() took no action to prevent it from self-reaping. - * - * Called with no locks held. - */ -static inline void tracehook_report_vfork_done(struct task_struct *child, - pid_t pid) -{ - ptrace_event(PTRACE_EVENT_VFORK_DONE, pid); -} - -/** - * tracehook_prepare_release_task - task is being reaped, clean up tracing - * @task: task in %EXIT_DEAD state - * - * This is called in release_task() just before @task gets finally reaped - * and freed. This would be the ideal place to remove and clean up any - * tracing-related state for @task. - * - * Called with no locks held. - */ -static inline void tracehook_prepare_release_task(struct task_struct *task) -{ -} - -/** - * tracehook_finish_release_task - final tracing clean-up - * @task: task in %EXIT_DEAD state - * - * This is called in release_task() when @task is being in the middle of - * being reaped. After this, there must be no tracing entanglements. - * - * Called with write_lock_irq(&tasklist_lock) held. 
- */ -static inline void tracehook_finish_release_task(struct task_struct *task) -{ - ptrace_release_task(task); -} - /** * tracehook_signal_handler - signal handler setup is complete * @sig: number of signal being delivered @@ -388,41 +291,6 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, ptrace_notify(SIGTRAP); } -/** - * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal - * @task: task receiving the signal - * @sig: signal number being sent - * - * Return zero iff tracing doesn't care to examine this ignored signal, - * so it can short-circuit normal delivery and never even get queued. - * - * Called with @task->sighand->siglock held. - */ -static inline int tracehook_consider_ignored_signal(struct task_struct *task, - int sig) -{ - return (task->ptrace & PT_PTRACED) != 0; -} - -/** - * tracehook_consider_fatal_signal - suppress special handling of fatal signal - * @task: task receiving the signal - * @sig: signal number being sent - * - * Return nonzero to prevent special handling of this termination signal. - * Normally handler for signal is %SIG_DFL. It can be %SIG_IGN if @sig is - * ignored, in which case force_sig() is about to reset it to %SIG_DFL. - * When this returns zero, this signal might cause a quick termination - * that does not give the debugger a chance to intercept the signal. - * - * Called with or without @task->sighand->siglock held. - */ -static inline int tracehook_consider_fatal_signal(struct task_struct *task, - int sig) -{ - return (task->ptrace & PT_PTRACED) != 0; -} - #define DEATH_REAP -1 #define DEATH_DELAYED_GROUP_LEADER -2 @@ -457,30 +325,6 @@ static inline int tracehook_notify_death(struct task_struct *task, return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER; } -/** - * tracehook_report_death - task is dead and ready to be reaped - * @task: @current task now exiting - * @signal: return value from tracheook_notify_death() - * @death_cookie: value passed back from tracehook_notify_death() - * @group_dead: nonzero if this was the last thread in the group to die - * - * Thread has just become a zombie or is about to self-reap. If positive, - * @signal is the signal number just sent to the parent (usually %SIGCHLD). - * If @signal is %DEATH_REAP, this thread will self-reap. If @signal is - * %DEATH_DELAYED_GROUP_LEADER, this is a delayed_group_leader() zombie. - * The @death_cookie was passed back by tracehook_notify_death(). - * - * If normal reaping is not inhibited, @task->exit_state might be changing - * in parallel. - * - * Called without locks. - */ -static inline void tracehook_report_death(struct task_struct *task, - int signal, void *death_cookie, - int group_dead) -{ -} - #ifdef TIF_NOTIFY_RESUME /** * set_notify_resume - cause tracehook_notify_resume() to be called diff --git a/kernel/exit.c b/kernel/exit.c index e5cc05644609..d49134a7f250 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -169,7 +169,6 @@ void release_task(struct task_struct * p) struct task_struct *leader; int zap_leader; repeat: - tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. 
But shut RCU-lockdep up */ rcu_read_lock(); @@ -179,7 +178,7 @@ repeat: proc_flush_task(p); write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); + ptrace_release_task(p); __exit_signal(p); /* @@ -868,8 +867,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - tracehook_report_death(tsk, signal, cookie, group_dead); - /* If the process is dead, release it - nobody will wait for it */ if (signal == DEATH_REAP) release_task(tsk); @@ -924,7 +921,7 @@ NORET_TYPE void do_exit(long code) */ set_fs(USER_DS); - tracehook_report_exit(&code); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a0..d4f0dff9d617 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,7 +1527,7 @@ long do_fork(unsigned long clone_flags, freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - tracehook_report_vfork_done(p, nr); + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); diff --git a/kernel/signal.c b/kernel/signal.c index 0f3370872506..1550aee34f42 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) /* * Tracers may want to know about even ignored signals. */ - return !tracehook_consider_ignored_signal(t, sig); + return !t->ptrace; } /* @@ -493,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig); + /* if ptraced, let the tracer determine */ + return !tsk->ptrace; } /* @@ -981,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig))) { + (sig == SIGKILL || !t->ptrace)) { /* * This signal will be fatal to the whole group. */ diff --git a/mm/nommu.c b/mm/nommu.c index 1fd0c51b10a6..54ae707bdae8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) + if ((flags & MAP_PRIVATE) && current->ptrace) vm_flags &= ~VM_MAYSHARE; return vm_flags; -- cgit v1.2.3 From 7e0528dadc9f8b04e4de0dba48a075100c2afe75 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:44 -0500 Subject: slub: Push irq disable into allocate_slab() Do the irq handling in allocate_slab() instead of __slab_alloc(). __slab_alloc() is already cluttered and allocate_slab() is already fiddling around with gfp flags. 
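Hedged sketch of the pattern being centralized (the actual change is the allocate_slab() hunk below; this standalone helper is illustrative only): when the gfp mask allows sleeping, interrupts are re-enabled around the page allocation and disabled again before returning to the irq-off fast path.

static struct page *alloc_slab_page_sketch(gfp_t flags, int node,
					   unsigned int order)
{
	struct page *page;

	if (flags & __GFP_WAIT)
		local_irq_enable();	/* the allocation may sleep */

	page = alloc_pages_node(node, flags, order);

	if (flags & __GFP_WAIT)
		local_irq_disable();	/* back to the irq-off fast path */

	return page;
}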
v6->v7: Only increment ORDER_FALLBACK if we get a page during fallback Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 35f351f26193..add2ae74046c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1187,6 +1187,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); + flags |= s->allocflags; /* @@ -1203,12 +1208,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(flags, node, oo); - if (!page) - return NULL; - stat(s, ORDER_FALLBACK); + if (page) + stat(s, ORDER_FALLBACK); } + if (flags & __GFP_WAIT) + local_irq_disable(); + + if (!page) + return NULL; + if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); @@ -1849,15 +1859,8 @@ new_slab: goto load_freelist; } - gfpflags &= gfp_allowed_mask; - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - page = new_slab(s, gfpflags, node); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - if (page) { c = __this_cpu_ptr(s->cpu_slab); stat(s, ALLOC_SLAB); -- cgit v1.2.3 From 50d5c41cd151b21ac1dfc98f048210456ccacc20 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:45 -0500 Subject: slub: Do not use frozen page flag but a bit in the page counters Do not use a page flag for the frozen bit. It needs to be part of the state that is handled with cmpxchg_double(). So use a bit in the counter struct in the page struct for that purpose. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/mm_types.h | 5 +++-- include/linux/page-flags.h | 5 ----- mm/slub.c | 12 ++++++------ 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 027935c86c68..e5fb2a70518b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -41,8 +41,9 @@ struct page { * & limit reverse map searches. */ struct { /* SLUB */ - u16 inuse; - u16 objects; + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; }; }; union { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6081493db68f..20791f18f5cf 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -124,9 +124,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - - /* SLUB */ - PG_slub_frozen = PG_active, }; #ifndef __GENERATING_BOUNDS_H @@ -212,8 +209,6 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) -__PAGEFLAG(SlubFrozen, slub_frozen) - /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. 
diff --git a/mm/slub.c b/mm/slub.c index add2ae74046c..82b2d048a278 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -166,7 +166,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ @@ -1025,7 +1025,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } /* Special debug activities for freeing objects */ - if (!PageSlubFrozen(page) && !page->freelist) + if (!page->frozen && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -1424,7 +1424,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, { if (slab_trylock(page)) { __remove_partial(n, page); - __SetPageSlubFrozen(page); + page->frozen = 1; return 1; } return 0; @@ -1538,7 +1538,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - __ClearPageSlubFrozen(page); + page->frozen = 0; if (page->inuse) { if (page->freelist) { @@ -1868,7 +1868,7 @@ new_slab: flush_slab(s, c); slab_lock(page); - __SetPageSlubFrozen(page); + page->frozen = 1; c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -2048,7 +2048,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, page->freelist = object; page->inuse--; - if (unlikely(PageSlubFrozen(page))) { + if (unlikely(page->frozen)) { stat(s, FREE_FROZEN); goto out_unlock; } -- cgit v1.2.3 From 8cb0a5068f4108e8ca60d5e0bcfbe6901adcfaef Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:46 -0500 Subject: slub: Move page->frozen handling near where the page->freelist handling occurs This is necessary because the frozen bit has to be handled in the same cmpxchg_double with the freelist and the counters. 
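To make the constraint concrete, an illustrative (simplified) view of the layout: cmpxchg_double() updates two adjacent machine words at once, and the packed counters (including the frozen bit) share the word next to the freelist pointer, so both must change together.

/* Simplified from the struct page fields used by this series. */
struct slub_doubleword_sketch {
	void *freelist;			/* first word of the pair */
	union {
		unsigned long counters;	/* second word, updated as one unit */
		struct {
			unsigned inuse:16;
			unsigned objects:15;
			unsigned frozen:1;
		};
	};
};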
Signed-off-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 82b2d048a278..5a2d3d8e0558 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1286,6 +1286,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = 0; + page->frozen = 1; out: return page; } @@ -1424,7 +1425,6 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, { if (slab_trylock(page)) { __remove_partial(n, page); - page->frozen = 1; return 1; } return 0; @@ -1538,7 +1538,6 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - page->frozen = 0; if (page->inuse) { if (page->freelist) { @@ -1671,6 +1670,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) } c->page = NULL; c->tid = next_tid(c->tid); + page->frozen = 0; unfreeze_slab(s, page, tail); } @@ -1831,6 +1831,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: + VM_BUG_ON(!page->frozen); + object = page->freelist; if (unlikely(!object)) goto another_slab; @@ -1854,6 +1856,7 @@ new_slab: page = get_partial(s, gfpflags, node); if (page) { stat(s, ALLOC_FROM_PARTIAL); + page->frozen = 1; c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -2371,6 +2374,7 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; + page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); -- cgit v1.2.3 From b789ef518b2a7231b0668c813f677cee528a9d3f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:49 -0500 Subject: slub: Add cmpxchg_double_slab() Add a function that operates on the second doubleword in the page struct and manipulates the object counters, the freelist and the frozen attribute. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 65 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index c8668d161dd8..b42715294147 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -33,6 +33,7 @@ enum stat_item { DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { diff --git a/mm/slub.c b/mm/slub.c index 5a2d3d8e0558..be6715dd0ee8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. 
@@ -170,6 +173,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ static int kmem_size = sizeof(struct kmem_cache); @@ -338,6 +342,37 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + return 1; + } + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* * Determine a map of object in use on a page. @@ -2596,6 +2631,12 @@ static int kmem_cache_open(struct kmem_cache *s, } } +#ifdef CONFIG_CMPXCHG_DOUBLE + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. @@ -4248,8 +4289,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_DEBUG_FREE; + } return length; } SLAB_ATTR(sanity_checks); @@ -4263,8 +4306,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_TRACE; + } return length; } SLAB_ATTR(trace); @@ -4281,8 +4326,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; + } calculate_sizes(s, -1); return length; } @@ -4300,8 +4347,10 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; + } calculate_sizes(s, -1); return length; } @@ -4319,8 +4368,10 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; + } calculate_sizes(s, -1); return length; } @@ -4493,6 +4544,8 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); #endif static struct attribute *slab_attrs[] = { @@ -4550,6 +4603,8 @@ static struct attribute *slab_attrs[] = { &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, 
#endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, -- cgit v1.2.3 From 5cc6eee8a8c1aefe9c86fe7345a2aa1c4ca70dc6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:50 -0500 Subject: slub: explicit list_lock taking The allocator fastpath rework does change the usage of the list_lock. Remove the list_lock processing from the functions that hide them from the critical sections and move them into those critical sections. This in turn simplifies the support functions (no __ variant needed anymore) and simplifies the lock handling on bootstrap. Inline add_partial since it becomes pretty simple. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 89 +++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index be6715dd0ee8..e39be0928a22 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -916,26 +916,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Tracking of fully allocated slabs for debugging purposes. + * + * list_lock must be held. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - spin_lock(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) + return; + list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); } +/* + * list_lock must be held. + */ static void remove_full(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n; - if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } /* Tracking of the number of slabs for debugging purposes */ @@ -1060,8 +1061,13 @@ static noinline int free_debug_processing(struct kmem_cache *s, } /* Special debug activities for freeing objects */ - if (!page->frozen && !page->freelist) + if (!page->frozen && !page->freelist) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); remove_full(s, page); + spin_unlock(&n->list_lock); + } if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); @@ -1170,7 +1176,8 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1420,38 +1427,33 @@ static __always_inline int slab_trylock(struct page *page) } /* - * Management of partially allocated slabs + * Management of partially allocated slabs. + * + * list_lock must be held. */ -static void add_partial(struct kmem_cache_node *n, +static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - spin_lock(&n->list_lock); n->nr_partial++; if (tail) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static inline void __remove_partial(struct kmem_cache_node *n, +/* + * list_lock must be held. 
+ */ +static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } -static void remove_partial(struct kmem_cache *s, struct page *page) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - __remove_partial(n, page); - spin_unlock(&n->list_lock); -} - /* - * Lock slab and remove from the partial list. + * Lock slab, remove from the partial list and put the object into the + * per cpu freelist. * * Must hold list_lock. */ @@ -1459,7 +1461,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { - __remove_partial(n, page); + remove_partial(n, page); return 1; } return 0; @@ -1576,12 +1578,17 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) if (page->inuse) { if (page->freelist) { + spin_lock(&n->list_lock); add_partial(n, page, tail); + spin_unlock(&n->list_lock); stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); + if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) { + spin_lock(&n->list_lock); + add_full(s, n, page); + spin_unlock(&n->list_lock); + } } slab_unlock(page); } else { @@ -1597,7 +1604,9 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * kmem_cache_shrink can reclaim any empty slabs from * the partial list. */ + spin_lock(&n->list_lock); add_partial(n, page, 1); + spin_unlock(&n->list_lock); slab_unlock(page); } else { slab_unlock(page); @@ -2099,7 +2108,11 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * then add it. */ if (unlikely(!prior)) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); add_partial(get_node(s, page_to_nid(page)), page, 1); + spin_unlock(&n->list_lock); stat(s, FREE_ADD_PARTIAL); } @@ -2113,7 +2126,11 @@ slab_empty: /* * Slab still on the partial list. */ - remove_partial(s, page); + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + remove_partial(n, page); + spin_unlock(&n->list_lock); stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); @@ -2391,7 +2408,6 @@ static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -2418,14 +2434,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. - */ - local_irq_save(flags); add_partial(n, page, 0); - local_irq_restore(flags); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2709,7 +2718,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, @@ -3047,7 +3056,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * may have freed the last object and be * waiting to release the slab. 
*/ - __remove_partial(n, page); + remove_partial(n, page); slab_unlock(page); discard_slab(s, page); } else { -- cgit v1.2.3 From 61728d1efc927eccfa64c50ede4998a8765805c3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:51 -0500 Subject: slub: Pass kmem_cache struct to lock and freeze slab We need more information about the slab for the cmpxchg implementation. Signed-off-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e39be0928a22..5cf98ff09360 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1457,8 +1457,8 @@ static inline void remove_partial(struct kmem_cache_node *n, * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, - struct page *page) +static inline int lock_and_freeze_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { remove_partial(n, page); @@ -1470,7 +1470,8 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static struct page *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n) { struct page *page; @@ -1485,7 +1486,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) + if (lock_and_freeze_slab(s, n, page)) goto out; page = NULL; out: @@ -1536,7 +1537,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(n); + page = get_partial_node(s, n); if (page) { put_mems_allowed(); return page; @@ -1556,7 +1557,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) struct page *page; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(get_node(s, searchnode)); + page = get_partial_node(s, get_node(s, searchnode)); if (page || node != NUMA_NO_NODE) return page; @@ -2081,7 +2082,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; - unsigned long flags; + unsigned long uninitialized_var(flags); local_irq_save(flags); slab_lock(page); -- cgit v1.2.3 From 2cfb7455d223ab24b23df44be430faf92e12390f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:52 -0500 Subject: slub: Rework allocator fastpaths Rework the allocation paths so that updates of the page freelist, frozen state and number of objects use cmpxchg_double_slab(). 
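All of the reworked paths use the same read, compute, retry idiom around cmpxchg_double_slab(). A condensed sketch of the pattern as it appears in the hunks below (freezing a slab and claiming its whole freelist; statistics and error handling omitted):

	void *freelist;
	unsigned long counters;
	struct page new;

	do {
		freelist = page->freelist;		/* snapshot the old doubleword */
		counters = page->counters;
		new.counters = counters;
		new.inuse = page->objects;		/* claim every object */
		new.frozen = 1;				/* take the slab off the lists */
	} while (!cmpxchg_double_slab(s, page,
			freelist, counters,		/* expected old values */
			NULL, new.counters,		/* new freelist and counters */
			"lock and freeze"));

If another cpu changed either the freelist or the counters in the meantime, the compare fails and the loop recomputes against the new state.
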
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 409 ++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 280 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5cf98ff09360..5f0346c97c5f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -992,11 +992,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; @@ -1060,14 +1055,6 @@ static noinline int free_debug_processing(struct kmem_cache *s, goto fail; } - /* Special debug activities for freeing objects */ - if (!page->frozen && !page->freelist) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - remove_full(s, page); - spin_unlock(&n->list_lock); - } if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); @@ -1178,6 +1165,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1460,11 +1448,52 @@ static inline void remove_partial(struct kmem_cache_node *n, static inline int lock_and_freeze_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { - if (slab_trylock(page)) { - remove_partial(n, page); + void *freelist; + unsigned long counters; + struct page new; + + + if (!slab_trylock(page)) + return 0; + + /* + * Zap the freelist and set the frozen bit. + * The old freelist is the list of objects for the + * per cpu allocation list. + */ + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + new.inuse = page->objects; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + } while (!cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "lock and freeze")); + + remove_partial(n, page); + + if (freelist) { + /* Populate the per cpu freelist */ + this_cpu_write(s->cpu_slab->freelist, freelist); + this_cpu_write(s->cpu_slab->page, page); + this_cpu_write(s->cpu_slab->node, page_to_nid(page)); return 1; + } else { + /* + * Slab page came from the wrong list. No object to allocate + * from. Put it onto the correct list and continue partial + * scan. + */ + printk(KERN_ERR "SLUB: %s : Page without available objects on" + " partial list\n", s->name); + slab_unlock(page); + return 0; } - return 0; } /* @@ -1564,59 +1593,6 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) return get_any_partial(s, flags); } -/* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. - */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) - __releases(bitlock) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - if (page->inuse) { - - if (page->freelist) { - spin_lock(&n->list_lock); - add_partial(n, page, tail); - spin_unlock(&n->list_lock); - stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) { - spin_lock(&n->list_lock); - add_full(s, n, page); - spin_unlock(&n->list_lock); - } - } - slab_unlock(page); - } else { - stat(s, DEACTIVATE_EMPTY); - if (n->nr_partial < s->min_partial) { - /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from - * the partial list. - */ - spin_lock(&n->list_lock); - add_partial(n, page, 1); - spin_unlock(&n->list_lock); - slab_unlock(page); - } else { - slab_unlock(page); - stat(s, FREE_SLAB); - discard_slab(s, page); - } - } -} - #ifdef CONFIG_PREEMPT /* * Calculate the next globally unique transaction for disambiguiation @@ -1683,40 +1659,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } +/* + * Remove the cpu slab + */ + /* * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - __releases(bitlock) { + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct page *page = c->page; - int tail = 1; - - if (page->freelist) + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *freelist; + void *nextfree; + int tail = 0; + struct page new; + struct page old; + + if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); + tail = 1; + } + + c->tid = next_tid(c->tid); + c->page = NULL; + freelist = c->freelist; + c->freelist = NULL; + /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. */ - while (unlikely(c->freelist)) { - void **object; + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } - tail = 0; /* Hot objects. Put the slab first */ + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. 
+ */ +redo: - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = get_freepointer(s, c->freelist); + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); - /* And put onto the regular freelist */ - set_freepointer(s, object, page->freelist); - page->freelist = object; - page->inuse--; + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && n->nr_partial < s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); + } + } else { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock(&n->list_lock); + } + } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + + } + } + + l = m; + if (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + slab_unlock(page); + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); } - c->page = NULL; - c->tid = next_tid(c->tid); - page->frozen = 0; - unfreeze_slab(s, page, tail); } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -1851,6 +1948,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void **object; struct page *page; unsigned long flags; + struct page new; + unsigned long counters; local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -1873,25 +1972,33 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!node_match(c, node))) goto another_slab; - stat(s, ALLOC_REFILL); + stat(s, ALLOC_SLOWPATH); + + do { + object = page->freelist; + counters = page->counters; + new.counters = counters; + new.inuse = page->objects; + VM_BUG_ON(!new.frozen); + + } while (!cmpxchg_double_slab(s, page, + object, counters, + NULL, new.counters, + "__slab_alloc")); load_freelist: VM_BUG_ON(!page->frozen); - object = page->freelist; if (unlikely(!object)) goto another_slab; - if (kmem_cache_debug(s)) - goto debug; - c->freelist = get_freepointer(s, object); - page->inuse = page->objects; - page->freelist = NULL; + stat(s, ALLOC_REFILL); slab_unlock(page); + + c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); - stat(s, ALLOC_SLOWPATH); return object; another_slab: @@ -1901,9 +2008,10 @@ new_slab: page = get_partial(s, gfpflags, node); if (page) { stat(s, ALLOC_FROM_PARTIAL); - page->frozen = 1; - c->node = page_to_nid(page); - c->page = page; + object = c->freelist; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } @@ -1911,12 +2019,19 @@ new_slab: if (page) { c = __this_cpu_ptr(s->cpu_slab); - stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); + /* + * No other reference to the page yet so we can + 
* muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + page->inuse = page->objects; + + stat(s, ALLOC_SLAB); slab_lock(page); - page->frozen = 1; c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -1925,12 +2040,12 @@ new_slab: slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; + debug: - if (!alloc_debug_processing(s, page, object, addr)) - goto another_slab; + if (!object || !alloc_debug_processing(s, page, object, addr)) + goto new_slab; - page->inuse++; - page->freelist = get_freepointer(s, object); + c->freelist = get_freepointer(s, object); deactivate_slab(s, c); c->page = NULL; c->node = NUMA_NO_NODE; @@ -2082,6 +2197,11 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; + int was_frozen; + int inuse; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; unsigned long uninitialized_var(flags); local_irq_save(flags); @@ -2091,32 +2211,65 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) goto out_unlock; - prior = page->freelist; - set_freepointer(s, object, prior); - page->freelist = object; - page->inuse--; + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen && !n) { + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock(&n->list_lock); + } + inuse = new.inuse; - if (unlikely(page->frozen)) { - stat(s, FREE_FROZEN); - goto out_unlock; - } + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); - if (unlikely(!page->inuse)) - goto slab_empty; + if (likely(!n)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + goto out_unlock; + } /* - * Objects left in the slab. If it was not on the partial list before - * then add it. + * was_frozen may have been set after we acquired the list_lock in + * an earlier loop. So we need to check it here again. */ - if (unlikely(!prior)) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + if (was_frozen) + stat(s, FREE_FROZEN); + else { + if (unlikely(!inuse && n->nr_partial > s->min_partial)) + goto slab_empty; - spin_lock(&n->list_lock); - add_partial(get_node(s, page_to_nid(page)), page, 1); - spin_unlock(&n->list_lock); - stat(s, FREE_ADD_PARTIAL); + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, 0); + stat(s, FREE_ADD_PARTIAL); + } } + spin_unlock(&n->list_lock); + out_unlock: slab_unlock(page); local_irq_restore(flags); @@ -2127,13 +2280,11 @@ slab_empty: /* * Slab still on the partial list. 
*/ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); remove_partial(n, page); - spin_unlock(&n->list_lock); stat(s, FREE_REMOVE_PARTIAL); } + + spin_unlock(&n->list_lock); slab_unlock(page); local_irq_restore(flags); stat(s, FREE_SLAB); -- cgit v1.2.3 From 881db7fb03a77af0bcd460fd1de1f4062d5c18fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:53 -0500 Subject: slub: Invert locking and avoid slab lock Locking slabs is no longer necesary if the arch supports cmpxchg operations and if no debuggin features are used on a slab. If the arch does not support cmpxchg then we fallback to use the slab lock to do a cmpxchg like operation. The patch also changes the lock order. Slab locks are subsumed to the node lock now. With that approach slab_trylocking is no longer necessary. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 129 +++++++++++++++++++++++++------------------------------------- 1 file changed, 52 insertions(+), 77 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5f0346c97c5f..ee70c091e577 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,10 +2,11 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include @@ -32,15 +33,27 @@ /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slub_lock (Global Semaphore) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct. If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * slub_lock + * + * The role of the slub_lock is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -53,20 +66,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. 
We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has no one operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -342,6 +341,19 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, @@ -356,11 +368,14 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, } else #endif { + slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; + slab_unlock(page); return 1; } + slab_unlock(page); } cpu_relax(); @@ -377,7 +392,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, /* * Determine a map of object in use on a page. * - * Slab lock or node listlock must be held to guarantee that the page does + * Node listlock must be held to guarantee that the page does * not vanish from under us. 
*/ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) @@ -808,10 +823,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; unsigned long max_objects; + fp = page->freelist; while (fp && nr <= page->objects) { if (fp == search) return 1; @@ -1024,6 +1040,8 @@ bad: static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -1059,10 +1077,12 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); + slab_unlock(page); return 1; fail: slab_fix(s, "Object at 0x%p not freed", object); + slab_unlock(page); return 0; } @@ -1393,27 +1413,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page) free_slab(s, page); } -/* - * Per slab locking using the pagelock - */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - /* * Management of partially allocated slabs. * @@ -1445,17 +1444,13 @@ static inline void remove_partial(struct kmem_cache_node *n, * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache *s, +static inline int acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { void *freelist; unsigned long counters; struct page new; - - if (!slab_trylock(page)) - return 0; - /* * Zap the freelist and set the frozen bit. 
* The old freelist is the list of objects for the @@ -1491,7 +1486,6 @@ static inline int lock_and_freeze_slab(struct kmem_cache *s, */ printk(KERN_ERR "SLUB: %s : Page without available objects on" " partial list\n", s->name); - slab_unlock(page); return 0; } } @@ -1515,7 +1509,7 @@ static struct page *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(s, n, page)) + if (acquire_slab(s, n, page)) goto out; page = NULL; out: @@ -1804,8 +1798,6 @@ redo: "unfreezing slab")) goto redo; - slab_unlock(page); - if (lock) spin_unlock(&n->list_lock); @@ -1819,7 +1811,6 @@ redo: static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - slab_lock(c->page); deactivate_slab(s, c); } @@ -1968,7 +1959,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!page) goto new_slab; - slab_lock(page); if (unlikely(!node_match(c, node))) goto another_slab; @@ -1994,8 +1984,6 @@ load_freelist: stat(s, ALLOC_REFILL); - slab_unlock(page); - c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); @@ -2031,7 +2019,6 @@ new_slab: page->inuse = page->objects; stat(s, ALLOC_SLAB); - slab_lock(page); c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -2205,7 +2192,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, unsigned long uninitialized_var(flags); local_irq_save(flags); - slab_lock(page); stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) @@ -2271,7 +2257,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, spin_unlock(&n->list_lock); out_unlock: - slab_unlock(page); local_irq_restore(flags); return; @@ -2285,7 +2270,6 @@ slab_empty: } spin_unlock(&n->list_lock); - slab_unlock(page); local_irq_restore(flags); stat(s, FREE_SLAB); discard_slab(s, page); @@ -3202,14 +3186,8 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ + if (!page->inuse) { remove_partial(n, page); - slab_unlock(page); discard_slab(s, page); } else { list_move(&page->lru, @@ -3797,12 +3775,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, -- cgit v1.2.3 From 5c2e4bbbd60623f1024a753c291b666068f8a6e7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:54 -0500 Subject: slub: Disable interrupts in free_debug processing We will be calling free_debug_processing with interrupts disabled in some case when the later patches are applied. Some of the functions called by free_debug_processing expect interrupts to be off. 
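Note that local_irq_save()/local_irq_restore() nest cleanly: the function keeps its own copy of the flags, so taking them again in a context that already runs with interrupts disabled is harmless and simply restores the caller's state on the way out. The shape of the change, reduced to the locking (sketch only):

	unsigned long flags;

	local_irq_save(flags);		/* safe even if interrupts are already off */
	slab_lock(page);
	/* ... consistency checks that expect interrupts to be disabled ... */
	slab_unlock(page);
	local_irq_restore(flags);	/* back to whatever state the caller had */
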
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ee70c091e577..08c57a047548 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1040,6 +1040,10 @@ bad: static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + unsigned long flags; + int rc = 0; + + local_irq_save(flags); slab_lock(page); if (!check_slab(s, page)) @@ -1056,7 +1060,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - return 0; + goto out; if (unlikely(s != page->slab)) { if (!PageSlab(page)) { @@ -1077,13 +1081,15 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); + rc = 1; +out: slab_unlock(page); - return 1; + local_irq_restore(flags); + return rc; fail: slab_fix(s, "Object at 0x%p not freed", object); - slab_unlock(page); - return 0; + goto out; } static int __init setup_slub_debug(char *str) -- cgit v1.2.3 From 80f08c191f6c9563641291bea80657a3b9faabf0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:55 -0500 Subject: slub: Avoid disabling interrupts in free slowpath Disabling interrupts can be avoided now. However, list operation still require disabling interrupts since allocations can occur from interrupt contexts and there is no way to perform atomic list operations. The acquition of the list_lock therefore has to disable interrupts as well. Dropping interrupt handling significantly simplifies the slowpath. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 08c57a047548..cb6b0857e1a6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2197,11 +2197,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, struct kmem_cache_node *n = NULL; unsigned long uninitialized_var(flags); - local_irq_save(flags); stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) - goto out_unlock; + return; do { prior = page->freelist; @@ -2220,7 +2219,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. 
*/ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } inuse = new.inuse; @@ -2236,7 +2235,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (was_frozen) stat(s, FREE_FROZEN); - goto out_unlock; + return; } /* @@ -2259,11 +2258,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_ADD_PARTIAL); } } - - spin_unlock(&n->list_lock); - -out_unlock: - local_irq_restore(flags); + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -2275,8 +2270,7 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } - spin_unlock(&n->list_lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } -- cgit v1.2.3 From fc59c05306fe1dcfa3fb8ba34ed45407fba4689c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:56 -0500 Subject: slub: Get rid of the another_slab label We can avoid deactivate slab in special cases if we do the deactivation of slabs in each code flow that leads to new_slab. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index cb6b0857e1a6..41a15c1d8068 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1965,8 +1965,10 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!page) goto new_slab; - if (unlikely(!node_match(c, node))) - goto another_slab; + if (unlikely(!node_match(c, node))) { + deactivate_slab(s, c); + goto new_slab; + } stat(s, ALLOC_SLOWPATH); @@ -1986,7 +1988,7 @@ load_freelist: VM_BUG_ON(!page->frozen); if (unlikely(!object)) - goto another_slab; + goto new_slab; stat(s, ALLOC_REFILL); @@ -1995,9 +1997,6 @@ load_freelist: local_irq_restore(flags); return object; -another_slab: - deactivate_slab(s, c); - new_slab: page = get_partial(s, gfpflags, node); if (page) { -- cgit v1.2.3 From e36a2652d7d1ad97f7636a39bdd8654d296cc36b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:57 -0500 Subject: slub: Add statistics for the case that the current slab does not match the node Slub reloads the per cpu slab if the page does not satisfy the NUMA condition. Track those reloads since doing so has a performance impact. 
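The condition being counted is the node_match() test in the allocation fastpath, roughly (a simplified sketch of the helper as it looks in this tree):

	static inline int node_match(struct kmem_cache_cpu *c, int node)
	{
	#ifdef CONFIG_NUMA
		if (node != NUMA_NO_NODE && c->node != node)
			return 0;	/* cpu slab is on the wrong node, reload it */
	#endif
		return 1;
	}

With CONFIG_SLUB_STATS enabled the new counter then shows up as alloc_node_mismatch under /sys/kernel/slab/<cache>/, next to alloc_refill and the other stat_items.
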
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index b42715294147..5b228b785377 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -24,6 +24,7 @@ enum stat_item { ALLOC_FROM_PARTIAL, /* Cpu slab acquired from partial list */ ALLOC_SLAB, /* Cpu slab acquired from page allocator */ ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ FREE_SLAB, /* Slab freed to the page allocator */ CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ diff --git a/mm/slub.c b/mm/slub.c index 41a15c1d8068..e00b7732f556 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1966,6 +1966,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto new_slab; if (unlikely(!node_match(c, node))) { + stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, c); goto new_slab; } @@ -4671,6 +4672,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4730,6 +4732,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, -- cgit v1.2.3 From 03e404af26dc2ea0d278d7a342de0aab394793ce Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:58 -0500 Subject: slub: fast release on full slab Make deactivation occur implicitly while checking out the current freelist. This avoids one cmpxchg operation on a slab that is now fully in use. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5b228b785377..71441f89729b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -32,6 +32,7 @@ enum stat_item { DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ diff --git a/mm/slub.c b/mm/slub.c index e00b7732f556..25dac48c1c60 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1977,9 +1977,21 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, object = page->freelist; counters = page->counters; new.counters = counters; - new.inuse = page->objects; VM_BUG_ON(!new.frozen); + /* + * If there is no object left then we use this loop to + * deactivate the slab which is simple since no objects + * are left in the slab and therefore we do not need to + * put the page back onto the partial list. + * + * If there are objects left then we retrieve them + * and use them to refill the per cpu queue. 
+ */ + + new.inuse = page->objects; + new.frozen = object != NULL; + } while (!cmpxchg_double_slab(s, page, object, counters, NULL, new.counters, @@ -1988,8 +2000,11 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, load_freelist: VM_BUG_ON(!page->frozen); - if (unlikely(!object)) + if (unlikely(!object)) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); goto new_slab; + } stat(s, ALLOC_REFILL); @@ -4680,6 +4695,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); @@ -4740,6 +4756,7 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, &cmpxchg_double_cpu_fail_attr.attr, -- cgit v1.2.3 From 4eade540fc35353813097bfdb39465c9b8847a15 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:59 -0500 Subject: slub: Not necessary to check for empty slab on load_freelist load_freelist is now only branched to only if there are objects available. So no need to check the object variable for NULL. Signed-off-by: Pekka Enberg --- mm/slub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 25dac48c1c60..78c488202f7d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1997,9 +1997,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, NULL, new.counters, "__slab_alloc")); -load_freelist: - VM_BUG_ON(!page->frozen); - if (unlikely(!object)) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); @@ -2008,6 +2005,8 @@ load_freelist: stat(s, ALLOC_REFILL); +load_freelist: + VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); -- cgit v1.2.3 From d6543e3935cec9f66b9647c24c2e44c68f8a91fd Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 7 Jul 2011 11:36:36 -0700 Subject: slub: Enable backtrace for create/delete points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch attempts to grab a backtrace for the creation and deletion points of the slub object. When a fault is detected, we can then get a better idea of where the item was deleted. 
Example output from debugging some funky nfs/rpc behaviour: ============================================================================= BUG kmalloc-64: Object is on free-list ----------------------------------------------------------------------------- INFO: Allocated in rpcb_getport_async+0x39c/0x5a5 [sunrpc] age=381 cpu=3 pid=3750 __slab_alloc+0x348/0x3ba kmem_cache_alloc_trace+0x67/0xe7 rpcb_getport_async+0x39c/0x5a5 [sunrpc] call_bind+0x70/0x75 [sunrpc] __rpc_execute+0x78/0x24b [sunrpc] rpc_execute+0x3d/0x42 [sunrpc] rpc_run_task+0x79/0x81 [sunrpc] rpc_call_sync+0x3f/0x60 [sunrpc] rpc_ping+0x42/0x58 [sunrpc] rpc_create+0x4aa/0x527 [sunrpc] nfs_create_rpc_client+0xb1/0xf6 [nfs] nfs_init_client+0x3b/0x7d [nfs] nfs_get_client+0x453/0x5ab [nfs] nfs_create_server+0x10b/0x437 [nfs] nfs_fs_mount+0x4ca/0x708 [nfs] mount_fs+0x6b/0x152 INFO: Freed in rpcb_map_release+0x3f/0x44 [sunrpc] age=30 cpu=2 pid=29049 __slab_free+0x57/0x150 kfree+0x107/0x13a rpcb_map_release+0x3f/0x44 [sunrpc] rpc_release_calldata+0x12/0x14 [sunrpc] rpc_free_task+0x59/0x61 [sunrpc] rpc_final_put_task+0x82/0x8a [sunrpc] __rpc_execute+0x23c/0x24b [sunrpc] rpc_async_schedule+0x10/0x12 [sunrpc] process_one_work+0x230/0x41d worker_thread+0x133/0x217 kthread+0x7d/0x85 kernel_thread_helper+0x4/0x10 INFO: Slab 0xffffea00029aa470 objects=20 used=9 fp=0xffff8800be7830d8 flags=0x20000000004081 INFO: Object 0xffff8800be7830d8 @offset=4312 fp=0xffff8800be7827a8 Bytes b4 0xffff8800be7830c8: 87 a8 96 00 01 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a .�......ZZZZZZZZ Object 0xffff8800be7830d8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk Object 0xffff8800be7830e8: 6b 6b 6b 6b 01 08 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkk..kkkkkkkkkk Object 0xffff8800be7830f8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk Object 0xffff8800be783108: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk� Redzone 0xffff8800be783118: bb bb bb bb bb bb bb bb ������������� Padding 0xffff8800be783258: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ Pid: 29049, comm: kworker/2:2 Not tainted 3.0.0-rc4+ #8 Call Trace: [] print_trailer+0x131/0x13a [] object_err+0x35/0x3e [] verify_mem_not_deleted+0x7a/0xb7 [] rpcb_getport_done+0x23/0x126 [sunrpc] [] rpc_exit_task+0x3f/0x6d [sunrpc] [] __rpc_execute+0x78/0x24b [sunrpc] [] ? rpc_execute+0x42/0x42 [sunrpc] [] rpc_async_schedule+0x10/0x12 [sunrpc] [] process_one_work+0x230/0x41d [] ? process_one_work+0x17b/0x41d [] worker_thread+0x133/0x217 [] ? manage_workers+0x191/0x191 [] kthread+0x7d/0x85 [] kernel_thread_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? __init_kthread_worker+0x56/0x56 [] ? gs_change+0x13/0x13 Acked-by: Christoph Lameter Signed-off-by: Ben Greear Signed-off-by: Pekka Enberg --- mm/slub.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7be0223531b0..c9050995bc87 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -191,8 +191,12 @@ static LIST_HEAD(slab_caches); /* * Tracking user of a slab. 
*/ +#define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -420,6 +424,24 @@ static void set_track(struct kmem_cache *s, void *object, struct track *p = get_track(s, object, alloc); if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; @@ -444,6 +466,16 @@ static void print_track(const char *s, struct track *t) printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) -- cgit v1.2.3 From d18a90dd85f8243ed20cdadb6d8a37d595df456d Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 7 Jul 2011 11:36:37 -0700 Subject: slub: Add method to verify memory is not freed This is for tracking down suspect memory usage. Acked-by: Christoph Lameter Signed-off-by: Ben Greear Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 13 +++++++++++++ mm/slub.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index fd4fdc72bc8c..4b35c06dfbc5 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -218,6 +218,19 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order) return ret; } +/** + * Calling this on allocated memory will check that the memory + * is expected to be in use, and print warnings if not. + */ +#ifdef CONFIG_SLUB_DEBUG +extern bool verify_mem_not_deleted(const void *x); +#else +static inline bool verify_mem_not_deleted(const void *x) +{ + return true; +} +#endif + #ifdef CONFIG_TRACING extern void * kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size); diff --git a/mm/slub.c b/mm/slub.c index c9050995bc87..0e4f4f8245bc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2964,6 +2964,42 @@ size_t ksize(const void *object) } EXPORT_SYMBOL(ksize); +#ifdef CONFIG_SLUB_DEBUG +bool verify_mem_not_deleted(const void *x) +{ + struct page *page; + void *object = (void *)x; + unsigned long flags; + bool rv; + + if (unlikely(ZERO_OR_NULL_PTR(x))) + return false; + + local_irq_save(flags); + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + /* maybe it was from stack? 
*/ + rv = true; + goto out_unlock; + } + + slab_lock(page); + if (on_freelist(page->slab, page, object)) { + object_err(page->slab, page, object, "Object is on free-list"); + rv = false; + } else { + rv = true; + } + slab_unlock(page); + +out_unlock: + local_irq_restore(flags); + return rv; +} +EXPORT_SYMBOL(verify_mem_not_deleted); +#endif + void kfree(const void *x) { struct page *page; -- cgit v1.2.3 From c4089f98e943ff445665dea49c190657b34ccffe Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 26 Jun 2011 21:39:18 +0200 Subject: slub: reduce overhead of slub_debug slub checks for poison one byte by one, which is highly inefficient and shows up frequently as a highest cpu-eater in perf top. Joining reads gives nice speedup: (Compiling some project with different options) make -j12 make clean slub_debug disabled: 1m 27s 1.2 s slub_debug enabled: 1m 46s 7.6 s slub_debug enabled + this patch: 1m 33s 3.2 s check_bytes still shows up high, but not always at the top. Signed-off-by: Marcin Slusarz Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Cc: linux-mm@kvack.org Signed-off-by: Pekka Enberg --- mm/slub.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 0e4f4f8245bc..e3403b30159e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -589,10 +589,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) { while (bytes) { - if (*start != (u8)value) + if (*start != value) return start; start++; bytes--; @@ -600,6 +600,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) return NULL; } +static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) +{ + u64 value64; + unsigned int words, prefix; + + if (bytes <= 16) + return check_bytes8(start, value, bytes); + + value64 = value | value << 8 | value << 16 | value << 24; + value64 = value64 | value64 << 32; + prefix = 8 - ((unsigned long)start) % 8; + + if (prefix) { + u8 *r = check_bytes8(start, value, prefix); + if (r) + return r; + start += prefix; + bytes -= prefix; + } + + words = bytes / 8; + + while (words) { + if (*(u64 *)start != value64) + return check_bytes8(start, value, 8); + start += 8; + words--; + } + + return check_bytes8(start, value, bytes % 8); +} + static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { -- cgit v1.2.3 From bfa71457a091ac0e4e20cab36e8ebad63935e504 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 7 Jul 2011 22:47:01 +0300 Subject: SLUB: Fix missing include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following build breakage commit d6543e3 ("slub: Enable backtrace for create/delete points"): CC mm/slub.o mm/slub.c: In function ‘set_track’: mm/slub.c:428: error: storage size of ‘trace’ isn’t known mm/slub.c:435: error: implicit declaration of function ‘save_stack_trace’ mm/slub.c:428: warning: unused variable ‘trace’ make[1]: *** [mm/slub.o] Error 1 make: *** [mm/slub.o] Error 2 Signed-off-by: Pekka Enberg --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e3403b30159e..f899ff469f60 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -27,6 +27,7 @@ #include #include #include +#include #include -- cgit v1.2.3 From 
d46db3d58233be4be980eb1e42eebe7808bcabab Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 4 May 2011 19:54:37 -0600 Subject: writeback: make writeback_control.nr_to_write straight Pass struct wb_writeback_work all the way down to writeback_sb_inodes(), and initialize the struct writeback_control there. struct writeback_control is basically designed to control writeback of a single file, but we keep abusing it for writing multiple files in writeback_sb_inodes() and its callers. This immediately cleans things up, e.g. suddenly wbc.nr_to_write vs work->nr_pages starts to make sense, and instead of saving and restoring pages_skipped in writeback_sb_inodes it can always start with a clean zero value. It also makes a neat IO pattern change: large dirty files are now written in the full 4MB writeback chunk size, rather than whatever quota remained in wbc->nr_to_write. Acked-by: Jan Kara Proposed-by: Christoph Hellwig Signed-off-by: Wu Fengguang --- fs/btrfs/extent_io.c | 2 - fs/fs-writeback.c | 196 ++++++++++++++++++++++----------------- include/linux/writeback.h | 6 +- include/trace/events/writeback.h | 39 +++++--- mm/backing-dev.c | 17 +--- mm/page-writeback.c | 17 +--- 6 files changed, 148 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11c1efd..561262d35689 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2551,7 +2551,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, }; struct writeback_control wbc_writepages = { .sync_mode = wbc->sync_mode, - .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, @@ -2584,7 +2583,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6caa98247a5b..2c947da39f6e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -29,12 +29,22 @@ #include #include "internal.h" +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024L + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -472,7 +482,6 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * No need to add it back to the LRU. */ list_del_init(&inode->i_wb_list); - wbc->inodes_written++; } } inode_sync_complete(inode); @@ -506,6 +515,31 @@ static bool pin_sb_for_writeback(struct super_block *sb) return false; } +static long writeback_chunk_size(struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once.
+ * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else + pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -513,18 +547,30 @@ static bool pin_sb_for_writeback(struct super_block *sb) * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging @@ -539,7 +585,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -553,12 +599,18 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, requeue_io(inode, wb); continue; } - __iget(inode); + write_chunk = writeback_chunk_size(work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + writeback_single_inode(inode, wb, &wbc); - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wb, wbc); - if (wbc->pages_skipped != pages_skipped) { + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. @@ -570,17 +622,25 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, iput(inode); cond_resched(); spin_lock(&wb->list_lock); - if (wbc->nr_to_write <= 0) - return 1; + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. 
+ */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } - /* b_io is empty */ - return 1; + return wrote; } -static void __writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -590,33 +650,37 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb, requeue_io(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } /* Leave any unwritten inodes on b_io */ + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + }; + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - __writeback_inodes_wb(wb, wbc); + queue_io(wb, NULL); + __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); -} -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} static inline bool over_bground_thresh(void) { @@ -646,42 +710,13 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .tagged_writepages = work->tagged_writepages, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; - - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. 
- * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * writeback_sb_inodes() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) - write_chunk = LONG_MAX; + long progress; oldest_jif = jiffies; - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; spin_lock(&wb->list_lock); for (;;) { @@ -711,24 +746,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; } - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; - wbc.inodes_written = 0; - - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, wbc.older_than_this); + queue_io(wb, work->older_than_this); if (work->sb) - writeback_sb_inodes(work->sb, wb, &wbc, true); + progress = writeback_sb_inodes(work->sb, wb, work); else - __writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); - - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); /* * Did we write something? Try for more @@ -738,9 +766,7 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write < write_chunk) - continue; - if (wbc.inodes_written) + if (progress) continue; /* * No more inodes for IO, bail @@ -753,8 +779,8 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. 
*/ if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); @@ -762,7 +788,7 @@ static long wb_writeback(struct bdi_writeback *wb, } spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2f1b512bd6e0..df1b7f18f100 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -24,12 +24,9 @@ enum writeback_sync_modes { */ struct writeback_control { enum writeback_sync_modes sync_mode; - unsigned long *older_than_this; /* If !NULL, only write back inodes - older than this */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ - long inodes_written; /* # of inodes written (at least) */ /* * For a_ops->writepages(): is start or end are non-zero then this is @@ -56,8 +53,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); int writeback_inodes_sb_if_idle(struct super_block *); int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); void sync_inodes_sb(struct super_block *); -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 205d14919ef2..3e7662a0cfa3 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -62,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, \ DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); +DEFINE_WRITEBACK_WORK_EVENT(writeback_start); +DEFINE_WRITEBACK_WORK_EVENT(writeback_written); +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), @@ -101,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DEFINE_WRITEBACK_EVENT(balance_dirty_start); +DEFINE_WRITEBACK_EVENT(balance_dirty_wait); + +TRACE_EVENT(balance_dirty_written, + + TP_PROTO(struct backing_dev_info *bdi, int written), + + TP_ARGS(bdi, written), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(int, written) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->written = written; + ), + + TP_printk("bdi %s written %d", + __entry->name, + __entry->written + ) +); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -114,7 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) - __field(unsigned long, older_than_this) __field(long, range_start) __field(long, range_end) ), @@ -128,14 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; - __entry->older_than_this = wbc->older_than_this ? 
- *wbc->older_than_this : 0; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " - "bgrd=%d reclm=%d cyclic=%d older=0x%lx " + "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx", __entry->name, __entry->nr_to_write, @@ -145,7 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, - __entry->older_than_this, __entry->range_start, __entry->range_end) ) @@ -154,12 +177,6 @@ DECLARE_EVENT_CLASS(wbc_class, DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) -DEFINE_WBC_EVENT(wbc_writeback_start); -DEFINE_WBC_EVENT(wbc_writeback_written); -DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5f6553ef1ba7..7ba303be5e03 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -260,18 +260,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void bdi_flush_io(struct backing_dev_info *bdi) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .range_cyclic = 1, - .nr_to_write = 1024, - }; - - writeback_inodes_wb(&bdi->wb, &wbc); -} - /* * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be @@ -457,9 +445,10 @@ static int bdi_forker_thread(void *ptr) if (IS_ERR(task)) { /* * If thread creation fails, force writeout of - * the bdi from the thread. + * the bdi from the thread. Hopefully 1024 is + * large enough for efficient IO. */ - bdi_flush_io(bdi); + writeback_inodes_wb(&bdi->wb, 1024); } else { /* * The spinlock makes sure we do not lose diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1965d05a29cc..9d6ac2b6d942 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -491,13 +491,6 @@ static void balance_dirty_pages(struct address_space *mapping, struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); @@ -559,17 +552,17 @@ static void balance_dirty_pages(struct address_space *mapping, * threshold otherwise wait until the disk writes catch * up. 
*/ - trace_wbc_balance_dirty_start(&wbc, bdi); + trace_balance_dirty_start(bdi); if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); + pages_written += writeback_inodes_wb(&bdi->wb, + write_chunk); + trace_balance_dirty_written(bdi, pages_written); if (pages_written >= write_chunk) break; /* We've done our duty */ } - trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + trace_balance_dirty_wait(bdi); /* * Increase the delay for each loop, up to our previous -- cgit v1.2.3 From f7d2b1ecd0c714adefc7d3a942ef87beb828a763 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Dec 2010 22:44:24 -0600 Subject: writeback: account per-bdi accumulated written pages Introduce the BDI_WRITTEN counter. It will be used for estimating the bdi's write bandwidth. Peter Zijlstra : Move BDI_WRITTEN accounting into __bdi_writeout_inc(). This will cover and fix fuse, which only calls bdi_writeout_inc(). CC: Michael Rubin Reviewed-by: KOSAKI Motohiro Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- include/linux/backing-dev.h | 1 + mm/backing-dev.c | 10 ++++++++-- mm/page-writeback.c | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 47feb2c4706a..469d56443c63 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -40,6 +40,7 @@ typedef int (congested_fn)(void *, int); enum bdi_stat_item { BDI_RECLAIMABLE, BDI_WRITEBACK, + BDI_WRITTEN, NR_BDI_STAT_ITEMS }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7ba303be5e03..83f18a1d9d10 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" + "BdiWritten: %8lu kB\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" @@ -104,8 +105,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "state: %8lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + nr_dirty, + nr_io, + nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9d6ac2b6d942..8cd71376c63d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -219,6 +219,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { + __inc_bdi_stat(bdi, BDI_WRITTEN); __prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac); } -- cgit v1.2.3 From e98be2d599207c6b31e9bb340d52a231b2f3662d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 11:22:30 -0600 Subject: writeback: bdi write bandwidth estimation The estimation value will start from 100MB/s and adapt to the real bandwidth in seconds. It tries to update the bandwidth only when disk is fully utilized. Any inactive period of more than one second will be skipped. The estimated bandwidth will be reflecting how fast the device can writeout when _fully utilized_, and won't drop to 0 when it goes idle. The value will remain constant at disk idle time. 
At busy write time, if not considering fluctuations, it will also remain high unless it is knocked down by possible concurrent reads that compete with the async writes for disk time and bandwidth. The estimation is not done purely in the flusher because there is no guarantee that write_cache_pages() will return in time to update the bandwidth. The bdi->avg_write_bandwidth smoothing is very effective at filtering out sudden spikes, though it may be a little biased in the long term. The overheads are low because the bdi bandwidth update only occurs at 200ms intervals. The 200ms update interval is suitable because it's not possible to get the real bandwidth at any single instant, due to large fluctuations. The NFS commits can be as large as seconds worth of data. One XFS completion may be as large as half a second worth of data if we are going to increase the write chunk to half a second worth of data. In ext4, fluctuations with a time period of around 5 seconds are observed. There is another pattern of irregular periods of up to 20 seconds on SSD tests. That's why we not only do the estimation at 200ms intervals, but also average the estimates over a period of 3 seconds and then go further to do another level of smoothing in avg_write_bandwidth. CC: Li Shaohua CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 13 +++++++ include/linux/backing-dev.h | 5 +++ include/linux/writeback.h | 3 ++ mm/backing-dev.c | 12 +++++++ mm/page-writeback.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 120 insertions(+) (limited to 'mm') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2c947da39f6e..5826992910e9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -692,6 +692,16 @@ static inline bool over_bground_thresh(void) global_page_state(NR_UNSTABLE_NFS) > background_thresh); } +/* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, start_time); +} + /* * Explicit flushing or periodic writeback of "old" data. * @@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { + unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long oldest_jif; struct inode *inode; @@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb->bdi, work); + wb_update_bandwidth(wb, wb_start); + /* * Did we write something?
Try for more * diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 469d56443c63..a008982e7c08 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -73,6 +73,11 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + struct prop_local_percpu completions; int dirty_exceeded; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index df1b7f18f100..66862f2d90c8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time); + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 83f18a1d9d10..a76cdd160277 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -638,6 +638,11 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -660,6 +665,13 @@ int bdi_init(struct backing_dev_info *bdi) } bdi->dirty_exceeded = 0; + + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + err = prop_local_init_percpu(&bdi->completions); if (err) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8cd71376c63d..446bdf7b975b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,6 +36,11 @@ #include #include +/* + * Estimate write bandwidth at 200ms intervals. + */ +#define BANDWIDTH_INTERVAL max(HZ/5, 1) + /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. 
@@ -471,6 +476,85 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + u64 bw; + + /* + * bw = written * HZ / elapsed + * + * bw * elapsed + write_bandwidth * (period - elapsed) + * write_bandwidth = --------------------------------------------------- + * period + */ + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period)) { + do_div(bw, elapsed); + avg = bw; + goto out; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw >>= ilog2(period); + + /* + * one more level of smoothing, for filtering out sudden spikes + */ + if (avg > old && old >= (unsigned long)bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= (unsigned long)bw) + avg += (old - avg) >> 3; + +out: + bdi->write_bandwidth = bw; + bdi->avg_write_bandwidth = avg; +} + +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long written; + + /* + * rate-limit, only update once every 200ms. + */ + if (elapsed < BANDWIDTH_INTERVAL) + return; + + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* + * Skip quiet periods when disk bandwidth is under-utilized. + * (at least 1s idle time between two flusher runs) + */ + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + goto snapshot; + + bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +} + +static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) + return; + spin_lock(&bdi->wb.list_lock); + __bdi_update_bandwidth(bdi, start_time); + spin_unlock(&bdi->wb.list_lock); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -490,6 +574,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pause = 1; bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { nr_reclaimable = global_page_state(NR_FILE_DIRTY) + @@ -544,6 +629,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; + bdi_update_bandwidth(bdi, start_time); + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been -- cgit v1.2.3 From 00821b002df7da867bb2c15b4f83f3706371383f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 11:28:45 -0600 Subject: writeback: show bdi write bandwidth in debugfs Add a "BdiWriteBandwidth" entry and indent others in /debug/bdi/*/stats. btw, increase digital field width to 10, for keeping the possibly huge BdiWritten number aligned at least for desktop systems. Impact: this could break user space tools if they are dumb enough to depend on the number of white spaces. 
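For illustration only (not part of this patch, and with made-up values), the reformatted stats file would look roughly like the following; the exact label padding comes from the seq_printf() format strings in the diff below, so the alignment here is only approximate:

	BdiWriteback:              96 kB
	BdiReclaimable:         12288 kB
	BdiDirtyThresh:         65536 kB
	DirtyThresh:           131072 kB
	BackgroundThresh:       65536 kB
	BdiWritten:           1048576 kB
	BdiWriteBandwidth:      38400 kBps
	b_dirty:                    3
	b_io:                       0
	b_more_io:                  1
	bdi_list:                   1
	state:                      8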
CC: Theodore Ts'o CC: Jan Kara CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- mm/backing-dev.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a76cdd160277..ddd0345e2e6d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -92,23 +92,25 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "BdiWritten: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), K(bdi_thresh), K(dirty_thresh), K(background_thresh), (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), nr_dirty, nr_io, nr_more_io, -- cgit v1.2.3 From 7762741e3af69720186802e945229b6a5afd5c49 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 12 Sep 2010 13:34:05 -0600 Subject: writeback: consolidate variable names in balance_dirty_pages() Introduce nr_dirty = NR_FILE_DIRTY + NR_WRITEBACK + NR_UNSTABLE_NFS in order to simplify many tests in the following patches. balance_dirty_pages() will eventually care only about the dirty sums besides nr_writeback. Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 446bdf7b975b..5f3e1b46ace5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -565,8 +565,9 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, static void balance_dirty_pages(struct address_space *mapping, unsigned long write_chunk) { - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; + unsigned long nr_reclaimable, bdi_nr_reclaimable; + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ + unsigned long bdi_dirty; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -579,7 +580,7 @@ static void balance_dirty_pages(struct address_space *mapping, for (;;) { nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -588,8 +589,7 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. 
*/ - if (nr_reclaimable + nr_writeback <= - (background_thresh + dirty_thresh) / 2) + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) break; bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -607,10 +607,12 @@ static void balance_dirty_pages(struct address_space *mapping, */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_nr_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_nr_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); } /* @@ -619,9 +621,8 @@ static void balance_dirty_pages(struct address_space *mapping, * bdi or process from holding back light ones; The latter is * the last resort safeguard. */ - dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) - || (nr_reclaimable + nr_writeback > dirty_thresh); + dirty_exceeded = (bdi_dirty > bdi_thresh) || + (nr_dirty > dirty_thresh); if (!dirty_exceeded) break; -- cgit v1.2.3 From c42843f2f0bbc9d716a32caf667d18fc2bf3bc4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Mar 2011 15:54:09 -0600 Subject: writeback: introduce smoothed global dirty limit The start of a heavy weight application (ie. KVM) may instantly knock down determine_dirtyable_memory() if the swap is not enabled or full. global_dirty_limits() and bdi_dirty_limit() will in turn get global/bdi dirty thresholds that are _much_ lower than the global/bdi dirty pages. balance_dirty_pages() will then heavily throttle all dirtiers including the light ones, until the dirty pages drop below the new dirty thresholds. During this _deep_ dirty-exceeded state, the system may appear rather unresponsive to the users. About "deep" dirty-exceeded: task_dirty_limit() assigns 1/8 lower dirty threshold to heavy dirtiers than light ones, and the dirty pages will be throttled around the heavy dirtiers' dirty threshold and reasonably below the light dirtiers' dirty threshold. In this state, only the heavy dirtiers will be throttled and the dirty pages are carefully controlled to not exceed the light dirtiers' dirty threshold. However if the threshold itself suddenly drops below the number of dirty pages, the light dirtiers will get heavily throttled. So introduce global_dirty_limit for tracking the global dirty threshold with policies - follow downwards slowly - follow up in one shot global_dirty_limit can effectively mask out the impact of sudden drop of dirtyable memory. It will be used in the next patch for two new type of dirty limits. Note that the new dirty limits are not going to avoid throttling the light dirtiers, but could limit their sleep time to 200ms. 
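To make the "follow down slowly" policy concrete, here is a rough worked example (hypothetical numbers, ignoring the max(thresh, dirty) clamp, and assuming one update per 200ms BANDWIDTH_INTERVAL as in the update_dirty_limit() code added below):

	dirty_thresh drops from 100000 to 60000 pages, global_dirty_limit = 100000
	update 1: limit -= (100000 - 60000) >> 5 = 1250  -> limit = 98750
	update 2: limit -= ( 98750 - 60000) >> 5 = 1210  -> limit = 97540
	...
	each update closes about 1/32 (~3%) of the remaining gap, so halving the
	gap takes roughly 22 updates, i.e. about 4.4 seconds; a rising threshold
	is followed in a single step (limit = thresh).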
Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 6 ++++ mm/page-writeback.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5826992910e9..227ff12257f3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -699,7 +699,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 66862f2d90c8..e9d371b6053b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -84,6 +84,8 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +extern unsigned long global_dirty_limit; + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; @@ -119,6 +121,10 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time); void page_writeback_init(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f3e1b46ace5..da959952b9f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -116,6 +116,7 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ +unsigned long global_dirty_limit; /* * Scale the writeback cache size proportional to the relative writeout speeds. @@ -516,7 +517,67 @@ out: bdi->avg_write_bandwidth = avg; } +/* + * The global dirtyable memory and dirty threshold could be suddenly knocked + * down by a large amount (eg. on the startup of KVM in a swapless system). + * This may throw the system into deep dirty exceeded state and throttle + * heavy/light dirtiers alike. To retain good responsiveness, maintain + * global_dirty_limit for tracking slowly down to the knocked down dirty + * threshold. + */ +static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +{ + unsigned long limit = global_dirty_limit; + + /* + * Follow up in one step. + */ + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* + * Follow down slowly. Use the higher one as the target, because thresh + * may drop below dirty. This is exactly the reason to introduce + * global_dirty_limit which is guaranteed to lie above the dirty pages. 
+ */ + thresh = max(thresh, dirty); + if (limit > thresh) { + limit -= (limit - thresh) >> 5; + goto update; + } + return; +update: + global_dirty_limit = limit; +} + +static void global_update_bandwidth(unsigned long thresh, + unsigned long dirty, + unsigned long now) +{ + static DEFINE_SPINLOCK(dirty_lock); + static unsigned long update_time; + + /* + * check locklessly first to optimize away locking for the most time + */ + if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + return; + + spin_lock(&dirty_lock); + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { + update_dirty_limit(thresh, dirty); + update_time = now; + } + spin_unlock(&dirty_lock); +} + void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time) { unsigned long now = jiffies; @@ -538,6 +599,9 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) goto snapshot; + if (thresh) + global_update_bandwidth(thresh, dirty, now); + bdi_update_write_bandwidth(bdi, elapsed, written); snapshot: @@ -546,12 +610,17 @@ snapshot: } static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time) { if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) return; spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, start_time); + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, + start_time); spin_unlock(&bdi->wb.list_lock); } @@ -630,7 +699,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; - bdi_update_bandwidth(bdi, start_time); + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_thresh, bdi_dirty, start_time); /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked -- cgit v1.2.3 From ffd1f609ab10532e8137b4b981fdf903ef4d0b32 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 19 Jun 2011 22:18:42 -0600 Subject: writeback: introduce max-pause and pass-good dirty limits The max-pause limit helps to keep the sleep time inside balance_dirty_pages() within MAX_PAUSE=200ms. The 200ms max sleep means per task rate limit of 8pages/200ms=160KB/s when dirty exceeded, which normally is enough to stop dirtiers from continue pushing the dirty pages high, unless there are a sufficient large number of slow dirtiers (eg. 500 tasks doing 160KB/s will still sum up to 80MB/s, exceeding the write bandwidth of a slow disk and hence accumulating more and more dirty pages). The pass-good limit helps to let go of the good bdi's in the presence of a blocked bdi (ie. NFS server not responding) or slow USB disk which for some reason build up a large number of initial dirty pages that refuse to go away anytime soon. For example, given two bdi's A and B and the initial state bdi_thresh_A = dirty_thresh / 2 bdi_thresh_B = dirty_thresh / 2 bdi_dirty_A = dirty_thresh / 2 bdi_dirty_B = dirty_thresh / 2 Then A get blocked, after a dozen seconds bdi_thresh_A = 0 bdi_thresh_B = dirty_thresh bdi_dirty_A = dirty_thresh / 2 bdi_dirty_B = dirty_thresh / 2 The (bdi_dirty_B < bdi_thresh_B) test is now useless and the dirty pages will be effectively throttled by condition (nr_dirty < dirty_thresh). 
This has two problems: (1) we lose the protections for light dirtiers (2) balance_dirty_pages() effectively becomes IO-less because the (bdi_nr_reclaimable > bdi_thresh) test won't be true. This is good for IO, but balance_dirty_pages() loses an important way to break out of the loop which leads to more spread out throttle delays. DIRTY_PASSGOOD_AREA can eliminate the above issues. The only problem is, DIRTY_PASSGOOD_AREA needs to be defined as 2 to fully cover the above example while this patch uses the more conservative value 8 so as not to surprise people with too many dirty pages than expected. The max-pause limit won't noticeably impact the speed dirty pages are knocked down when there is a sudden drop of global/bdi dirty thresholds. Because the heavy dirties will be throttled below 160KB/s which is slow enough. It does help to avoid long dirty throttle delays and especially will make light dirtiers more responsive. Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 21 +++++++++++++++++++++ mm/page-writeback.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'mm') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e9d371b6053b..b625073b80c8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,27 @@ #include #include +/* + * The 1/16 region above the global dirty limit will be put to maximum pauses: + * + * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) + * + * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put + * to loops: + * + * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) + * + * Further beyond, all dirtier tasks will enter a loop waiting (possibly long + * time) for the dirty pages to drop, unless written enough pages. + * + * The global dirty threshold is normally equal to the global dirty limit, + * except when the system suddenly allocates a lot of anonymous memory and + * knocks down the global dirty threshold quickly, in which case the global + * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. + */ +#define DIRTY_MAXPAUSE_AREA 16 +#define DIRTY_PASSGOOD_AREA 8 + struct backing_dev_info; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index da959952b9f5..798842a22474 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,6 +36,11 @@ #include #include +/* + * Sleep at most 200ms at a time in balance_dirty_pages(). + */ +#define MAX_PAUSE max(HZ/5, 1) + /* * Estimate write bandwidth at 200ms intervals. */ @@ -399,6 +404,11 @@ unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } +static unsigned long hard_dirty_limit(unsigned long thresh) +{ + return max(thresh, global_dirty_limit); +} + /* * global_dirty_limits - background-writeback and dirty-throttling thresholds * @@ -723,6 +733,29 @@ static void balance_dirty_pages(struct address_space *mapping, io_schedule_timeout(pause); trace_balance_dirty_wait(bdi); + dirty_thresh = hard_dirty_limit(dirty_thresh); + /* + * max-pause area. If dirty exceeded but still within this + * area, no need to sleep for more than 200ms: (a) 8 pages per + * 200ms is typically more than enough to curb heavy dirtiers; + * (b) the pause time limit makes the dirtiers more responsive. + */ + if (nr_dirty < dirty_thresh + + dirty_thresh / DIRTY_MAXPAUSE_AREA && + time_after(jiffies, start_time + MAX_PAUSE)) + break; + /* + * pass-good area. When some bdi gets blocked (eg. 
NFS server + * not responding), or write bandwidth dropped dramatically due + * to concurrent reads, or dirty threshold suddenly dropped and + * the dirty pages cannot be brought down anytime soon (eg. on + * slow USB stick), at least let go of the good bdi's. + */ + if (nr_dirty < dirty_thresh + + dirty_thresh / DIRTY_PASSGOOD_AREA && + bdi_dirty < bdi_thresh) + break; + /* * Increase the delay for each loop, up to our previous * default of taking a 100ms nap. -- cgit v1.2.3 From e1cbe236013c82bcf9a156e98d7b47efb89d2674 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 6 Dec 2010 22:34:29 -0600 Subject: writeback: trace global_dirty_state Add trace event balance_dirty_state for showing the global dirty page counts and thresholds at each global_dirty_limits() invocation. This will cover the callers throttle_vm_writeout(), over_bground_thresh() and each balance_dirty_pages() loop. Signed-off-by: Wu Fengguang --- include/trace/events/writeback.h | 46 ++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 1 + 2 files changed, 47 insertions(+) (limited to 'mm') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3e7662a0cfa3..6bca4cc0063c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -204,6 +204,52 @@ TRACE_EVENT(writeback_queue_io, __entry->moved) ); +TRACE_EVENT(global_dirty_state, + + TP_PROTO(unsigned long background_thresh, + unsigned long dirty_thresh + ), + + TP_ARGS(background_thresh, + dirty_thresh + ), + + TP_STRUCT__entry( + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_unstable) + __field(unsigned long, background_thresh) + __field(unsigned long, dirty_thresh) + __field(unsigned long, dirty_limit) + __field(unsigned long, nr_dirtied) + __field(unsigned long, nr_written) + ), + + TP_fast_assign( + __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_page_state(NR_DIRTIED); + __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->background_thresh = background_thresh; + __entry->dirty_thresh = dirty_thresh; + __entry->dirty_limit = global_dirty_limit; + ), + + TP_printk("dirty=%lu writeback=%lu unstable=%lu " + "bg_thresh=%lu thresh=%lu limit=%lu " + "dirtied=%lu written=%lu", + __entry->nr_dirty, + __entry->nr_writeback, + __entry->nr_unstable, + __entry->background_thresh, + __entry->dirty_thresh, + __entry->dirty_limit, + __entry->nr_dirtied, + __entry->nr_written + ) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 798842a22474..f9d9f5476d58 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -447,6 +447,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) } *pbackground = background; *pdirty = dirty; + trace_global_dirty_state(background, dirty); } /** -- cgit v1.2.3 From 1d07171c5e58e68a76a141970a3a5e816a414ce6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jul 2011 12:49:12 -0500 Subject: slub: disable interrupts in cmpxchg_double_slab when falling back to pagelock Split cmpxchg_double_slab into two functions. 
One for the case where we know that interrupts are disabled (and therefore the fallback does not need to disable interrupts) and one for the other cases where fallback will also disable interrupts. This fixes the issue that __slab_free called cmpxchg_double_slab in some scenarios without disabling interrupts. Tested-by: Hugh Dickins Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 78c488202f7d..7836b45ea1fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -354,6 +354,42 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, @@ -368,14 +404,19 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, } else #endif { + unsigned long flags; + + local_irq_save(flags); slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; slab_unlock(page); + local_irq_restore(flags); return 1; } slab_unlock(page); + local_irq_restore(flags); } cpu_relax(); @@ -1471,7 +1512,7 @@ static inline int acquire_slab(struct kmem_cache *s, VM_BUG_ON(new.frozen); new.frozen = 1; - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, freelist, counters, NULL, new.counters, "lock and freeze")); @@ -1709,7 +1750,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) new.inuse--; VM_BUG_ON(!new.frozen); - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, prior, counters, freelist, new.counters, "drain percpu freelist")); @@ -1798,7 +1839,7 @@ redo: } l = m; - if (!cmpxchg_double_slab(s, page, + if (!__cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) @@ -1992,7 +2033,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, new.inuse = page->objects; new.frozen = object != NULL; - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, object, counters, NULL, new.counters, "__slab_alloc")); -- cgit v1.2.3 From c225150b86fef9f7663219b6e9f7606ea1607312 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 11 Jul 2011 13:35:08 -0700 Subject: slab: fix DEBUG_SLAB build Fix CONFIG_SLAB=y CONFIG_DEBUG_SLAB=y build error and warnings. 
Now that ARCH_SLAB_MINALIGN defaults to __alignof__(unsigned long long), it is always defined (when slab.h included), but cannot be used in #if: mm/slab.c: In function `cache_alloc_debugcheck_after': mm/slab.c:3156:5: warning: "__alignof__" is not defined mm/slab.c:3156:5: error: missing binary operator before token "(" make[1]: *** [mm/slab.o] Error 1 So just remove the #if and #endif lines, but then 64-bit build warns: mm/slab.c: In function `cache_alloc_debugcheck_after': mm/slab.c:3156:6: warning: cast from pointer to integer of different size mm/slab.c:3158:10: warning: format `%d' expects type `int', but argument 3 has type `long unsigned int' Fix those with casts, whatever the actual type of ARCH_SLAB_MINALIGN. Acked-by: Christoph Lameter Signed-off-by: Hugh Dickins Signed-off-by: Pekka Enberg --- mm/slab.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bcfa4987c8ae..ef8ceb726e71 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3153,12 +3153,10 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); -#if ARCH_SLAB_MINALIGN - if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", - objp, ARCH_SLAB_MINALIGN); + objp, (int)ARCH_SLAB_MINALIGN); } -#endif return objp; } #else -- cgit v1.2.3 From 9d8f13ba3f4833219e50767b022b82cd0da930eb Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 6 Jun 2011 15:29:25 -0400 Subject: security: new security_inode_init_security API adds function callback This patch changes the security_inode_init_security API by adding a filesystem specific callback to write security extended attributes. This change is in preparation for supporting the initialization of multiple LSM xattrs and the EVM xattr. Initially the callback function walks an array of xattrs, writing each xattr separately, but could be optimized to write multiple xattrs at once. For existing security_inode_init_security() calls, which have not yet been converted to use the new callback function, such as those in reiserfs and ocfs2, this patch defines security_old_inode_init_security(). 
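As a condensed sketch of the new contract (illustrative only; "myfs" and myfs_setxattr() are hypothetical names, the real per-filesystem conversions follow in the diff below): the LSM builds an array of struct xattr terminated by a NULL name, and the filesystem callback persists each entry with its own setxattr primitive, returning 0 or a negative errno.

	static int myfs_initxattrs(struct inode *inode,
				   const struct xattr *xattr_array, void *fs_info)
	{
		const struct xattr *xattr;
		int err = 0;

		/* walk the array the LSM prepared; a NULL name terminates it */
		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
			err = myfs_setxattr(inode, xattr->name,
					    xattr->value, xattr->value_len);
			if (err < 0)
				break;
		}
		return err;
	}

	static int myfs_init_security(struct inode *inode, struct inode *dir,
				      const struct qstr *qstr)
	{
		/* the last argument is handed back to the callback as fs_info */
		return security_inode_init_security(inode, dir, qstr,
						    &myfs_initxattrs, NULL);
	}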
Signed-off-by: Mimi Zohar --- fs/btrfs/xattr.c | 50 +++++++++++++++++++------------------- fs/ext2/xattr_security.c | 34 +++++++++++++------------- fs/ext3/xattr_security.c | 36 +++++++++++++++------------- fs/ext4/xattr_security.c | 36 +++++++++++++++------------- fs/gfs2/inode.c | 38 ++++++++++++++--------------- fs/jffs2/security.c | 35 ++++++++++++++------------- fs/jfs/xattr.c | 57 ++++++++++++++++++++++---------------------- fs/ocfs2/xattr.c | 38 ++++++++++++++++++----------- fs/reiserfs/xattr_security.c | 4 ++-- fs/xfs/linux-2.6/xfs_iops.c | 39 +++++++++++++++--------------- include/linux/security.h | 17 +++++++++---- include/linux/xattr.h | 6 +++++ mm/shmem.c | 4 ++-- security/security.c | 39 +++++++++++++++++++++++++++--- 14 files changed, 250 insertions(+), 183 deletions(-) (limited to 'mm') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5366fe452ab0..a039e6ed4ce0 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -360,36 +360,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; char *name; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, - GFP_NOFS); - if (!name) { - err = -ENOMEM; - } else { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(trans, inode, name, value, len, 0); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); kfree(name); + if (err < 0) + break; } - - kfree(suffix); - kfree(value); return err; } + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 5d979b4347b0..c922adc8ef41 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int -ext2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int 
+ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext2_initxattrs, NULL); +} + const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index b8d9f83aa5c5..3c218b8a51d4 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 007c3bfbf094..34e4350dd4d9 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 03e0c529063e..1d3a1a651721 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -624,31 +624,29 @@ fail: return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, - const struct qstr *qstr) +int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { 
- int err; - size_t len; - void *value; - char *name; - - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, - &name, &value, &len); - - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; } - - err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, - GFS2_EATYPE_SECURITY); - kfree(value); - kfree(name); - return err; } +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) +{ + return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, + &gfs2_initxattrs, NULL); +} + /** * gfs2_create_inode - Create a new inode * @dir: The parent directory diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index cfeb7164b085..0f20208df602 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -22,26 +22,29 @@ #include #include "nodelist.h" -/* ---- Initial Security Label Attachment -------------- */ -int jffs2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +/* ---- Initial Security Label(s) Attachment callback --- */ +int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); + return err; +} - kfree(name); - kfree(value); - return rc; +/* ---- Initial Security Label(s) Attachment ----------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jffs2_initxattrs, NULL); } /* ---- XATTR Handler for "security.*" ----------------- */ diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24838f1eeee5..e982509292f8 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -1091,38 +1091,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + tid_t *tid = fs_info; char *name; - - rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; - } - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), - GFP_NOFS); - if (!name) { - rc = -ENOMEM; - goto kmalloc_failed; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + err = __jfs_setxattr(*tid, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; } - strcpy(name, 
XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - - rc = __jfs_setxattr(tid, inode, name, value, len, 0); - - kfree(name); -kmalloc_failed: - kfree(suffix); - kfree(value); + return err; +} - return rc; +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jfs_initxattrs, &tid); } #endif diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 81ecf9c0bf0a..194fb22ef79d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7185,20 +7185,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, { int ret = 0; struct buffer_head *dir_bh = NULL; - struct ocfs2_security_xattr_info si = { - .enable = 1, - }; - ret = ocfs2_init_security_get(inode, dir, qstr, &si); + ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (!ret) { - ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, - si.name, si.value, si.value_len, - XATTR_CREATE); - if (ret) { - mlog_errno(ret); - goto leave; - } - } else if (ret != -EOPNOTSUPP) { mlog_errno(ret); goto leave; } @@ -7255,6 +7244,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } +int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, XATTR_CREATE); + if (err) + break; + } + return err; +} + int ocfs2_init_security_get(struct inode *inode, struct inode *dir, const struct qstr *qstr, @@ -7263,8 +7268,13 @@ int ocfs2_init_security_get(struct inode *inode, /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - return security_inode_init_security(inode, dir, qstr, &si->name, - &si->value, &si->value_len); + if (si) + return security_old_inode_init_security(inode, dir, qstr, + &si->name, &si->value, + &si->value_len); + + return security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, NULL); } int ocfs2_init_security_set(handle_t *handle, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ef66c18a9332..534668fa41be 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_inode_init_security(inode, dir, qstr, &sec->name, - &sec->value, &sec->length); + error = security_old_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index d44d92cd12b1..27a3658b830f 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -93,37 +93,38 @@ xfs_mark_inode_dirty( mark_inode_dirty(inode); } + +int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; +} + /* * Hook in SELinux. 
This is not quite correct yet, what we really need * here (as we do for default ACLs) is a mechanism by which creation of * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ + STATIC int xfs_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) { - struct xfs_inode *ip = XFS_I(inode); - size_t length; - void *value; - unsigned char *name; - int error; - - error = security_inode_init_security(inode, dir, qstr, (char **)&name, - &value, &length); - if (error) { - if (error == -EOPNOTSUPP) - return 0; - return -error; - } - - error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); - - kfree(name); - kfree(value); - return error; + return security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void diff --git a/include/linux/security.h b/include/linux/security.h index 8ce59ef3e5af..6a20c7025495 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -36,6 +36,7 @@ #include #include #include +#include #include /* Maximum number of letters for an LSM name string */ @@ -147,6 +148,10 @@ extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +/* security_inode_init_security callback function to write xattrs */ +typedef int (*initxattrs) (struct inode *inode, + const struct xattr *xattr_array, void *fs_data); + #ifdef CONFIG_SECURITY struct security_mnt_opts { @@ -1704,8 +1709,11 @@ int security_sb_parse_opts_str(char *options, struct security_mnt_opts *opts); int security_inode_alloc(struct inode *inode); void security_inode_free(struct inode *inode); int security_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, - void **value, size_t *len); + const struct qstr *qstr, + initxattrs initxattrs, void *fs_data); +int security_old_inode_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, char **name, + void **value, size_t *len); int security_inode_create(struct inode *dir, struct dentry *dentry, int mode); int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry); @@ -2035,9 +2043,8 @@ static inline void security_inode_free(struct inode *inode) static inline int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, - char **name, - void **value, - size_t *len) + initxattrs initxattrs, + void *fs_data) { return -EOPNOTSUPP; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index aed54c50aa66..7a378662ddff 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -67,6 +67,12 @@ struct xattr_handler { size_t size, int flags, int handler_flags); }; +struct xattr { + char *name; + void *value; + size_t value_len; +}; + ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb7..01c19c62d685 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1878,7 +1878,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, - &dentry->d_name, NULL, + &dentry->d_name, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { @@ -2018,7 +2018,7 @@ static int 
shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (!inode) return -ENOSPC; - error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, + error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { diff --git a/security/security.c b/security/security.c index 4ba6d4cc061f..3464d58a5766 100644 --- a/security/security.c +++ b/security/security.c @@ -18,6 +18,8 @@ #include #include +#define MAX_LSM_XATTR 1 + /* Boot-time LSM user choice */ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = CONFIG_DEFAULT_SECURITY; @@ -339,15 +341,46 @@ void security_inode_free(struct inode *inode) } int security_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, - void **value, size_t *len) + const struct qstr *qstr, + const initxattrs initxattrs, void *fs_data) +{ + struct xattr new_xattrs[MAX_LSM_XATTR + 1]; + struct xattr *lsm_xattr; + int ret; + + if (unlikely(IS_PRIVATE(inode))) + return -EOPNOTSUPP; + + memset(new_xattrs, 0, sizeof new_xattrs); + if (!initxattrs) + return security_ops->inode_init_security(inode, dir, qstr, + NULL, NULL, NULL); + lsm_xattr = new_xattrs; + ret = security_ops->inode_init_security(inode, dir, qstr, + &lsm_xattr->name, + &lsm_xattr->value, + &lsm_xattr->value_len); + if (ret) + goto out; + ret = initxattrs(inode, new_xattrs, fs_data); +out: + kfree(lsm_xattr->name); + kfree(lsm_xattr->value); + + return (ret == -EOPNOTSUPP) ? 0 : ret; +} +EXPORT_SYMBOL(security_inode_init_security); + +int security_old_inode_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, char **name, + void **value, size_t *len) { if (unlikely(IS_PRIVATE(inode))) return -EOPNOTSUPP; return security_ops->inode_init_security(inode, dir, qstr, name, value, len); } -EXPORT_SYMBOL(security_inode_init_security); +EXPORT_SYMBOL(security_old_inode_init_security); #ifdef CONFIG_SECURITY_PATH int security_path_mknod(struct path *dir, struct dentry *dentry, int mode, -- cgit v1.2.3 From 4746efded84d7c5a9c8d64d4c6e814ff0cf9fb42 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 19 Jul 2011 08:49:26 -0700 Subject: vmscan: fix a livelock in kswapd I'm running a workload which triggers a lot of swap in a machine with 4 nodes. After I kill the workload, I found a kswapd livelock. Sometimes kswapd3 or kswapd2 are keeping running and I can't access filesystem, but most memory is free. This looks like a regression since commit 08951e545918c159 ("mm: vmscan: correct check for kswapd sleeping in sleeping_prematurely"). Node 2 and 3 have only ZONE_NORMAL, but balance_pgdat() will return 0 for classzone_idx. The reason is end_zone in balance_pgdat() is 0 by default, if all zones have watermark ok, end_zone will keep 0. Later sleeping_prematurely() always returns true. Because this is an order 3 wakeup, and if classzone_idx is 0, both balanced_pages and present_pages in pgdat_balanced() are 0. We add a special case here. If a zone has no page, we think it's balanced. This fixes the livelock. 
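To make the corner case concrete (hypothetical numbers, not taken from the report): with classzone_idx == 0 and a node whose low zones hold no pages, both balanced_pages and present_pages are 0, so only the relaxed comparison lets kswapd sleep:

	unsigned long balanced_pages = 0, present_pages = 0;

	/* old check: 0 > 0 is false, so sleeping_prematurely() keeps kswapd running */
	bool old_balanced = balanced_pages > (present_pages >> 2);
	/* new check: 0 >= 0 is true, an empty classzone counts as balanced */
	bool new_balanced = balanced_pages >= (present_pages >> 2);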
Signed-off-by: Shaohua Li Acked-by: Mel Gorman Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ed24b94c5e6..d036e59d302b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2310,7 +2310,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, for (i = 0; i <= classzone_idx; i++) present_pages += pgdat->node_zones[i].present_pages; - return balanced_pages > (present_pages >> 2); + /* A special case here: if zone has no page, we think it's balanced */ + return balanced_pages >= (present_pages >> 2); } /* is kswapd sleeping prematurely? */ -- cgit v1.2.3 From 095760730c1047c69159ce88021a7fa3833502c8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:34 +1000 Subject: vmscan: add shrink_slab tracepoints It is impossible to understand what the shrinkers are actually doing without instrumenting the code, so add some tracepoints to allow insight to be gained. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/trace/events/vmscan.h | 77 +++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 8 ++++- 2 files changed, 84 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index b2c33bd955fa..36851f7f13da 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -179,6 +179,83 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TP_ARGS(nr_reclaimed) ); +TRACE_EVENT(mm_shrink_slab_start, + TP_PROTO(struct shrinker *shr, struct shrink_control *sc, + long nr_objects_to_shrink, unsigned long pgs_scanned, + unsigned long lru_pgs, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan), + + TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, + cache_items, delta, total_scan), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, nr_objects_to_shrink) + __field(gfp_t, gfp_flags) + __field(unsigned long, pgs_scanned) + __field(unsigned long, lru_pgs) + __field(unsigned long, cache_items) + __field(unsigned long long, delta) + __field(unsigned long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->nr_objects_to_shrink = nr_objects_to_shrink; + __entry->gfp_flags = sc->gfp_mask; + __entry->pgs_scanned = pgs_scanned; + __entry->lru_pgs = lru_pgs; + __entry->cache_items = cache_items; + __entry->delta = delta; + __entry->total_scan = total_scan; + ), + + TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + __entry->shrink, + __entry->shr, + __entry->nr_objects_to_shrink, + show_gfp_flags(__entry->gfp_flags), + __entry->pgs_scanned, + __entry->lru_pgs, + __entry->cache_items, + __entry->delta, + __entry->total_scan) +); + +TRACE_EVENT(mm_shrink_slab_end, + TP_PROTO(struct shrinker *shr, int shrinker_retval, + long unused_scan_cnt, long new_scan_cnt), + + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, unused_scan) + __field(long, new_scan) + __field(int, retval) + __field(long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->unused_scan = unused_scan_cnt; + __entry->new_scan = new_scan_cnt; +
__entry->retval = shrinker_retval; + __entry->total_scan = new_scan_cnt - unused_scan_cnt; + ), + + TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + __entry->shrink, + __entry->shr, + __entry->unused_scan, + __entry->new_scan, + __entry->total_scan, + __entry->retval) +); DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, diff --git a/mm/vmscan.c b/mm/vmscan.c index d036e59d302b..255226554a3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -250,6 +250,7 @@ unsigned long shrink_slab(struct shrink_control *shrink, unsigned long long delta; unsigned long total_scan; unsigned long max_pass; + int shrink_ret = 0; max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -274,9 +275,12 @@ unsigned long shrink_slab(struct shrink_control *shrink, total_scan = shrinker->nr; shrinker->nr = 0; + trace_mm_shrink_slab_start(shrinker, shrink, total_scan, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); + while (total_scan >= SHRINK_BATCH) { long this_scan = SHRINK_BATCH; - int shrink_ret; int nr_before; nr_before = do_shrinker_shrink(shrinker, shrink, 0); @@ -293,6 +297,8 @@ unsigned long shrink_slab(struct shrink_control *shrink, } shrinker->nr += total_scan; + trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan, + shrinker->nr); } up_read(&shrinker_rwsem); out: -- cgit v1.2.3 From acf92b485cccf028177f46918e045c0c4e80ee10 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:35 +1000 Subject: vmscan: shrinker->nr updates race and go wrong shrink_slab() allows shrinkers to be called in parallel so the struct shrinker can be updated concurrently. It does not provide any exclusion for such updates, so we can get the shrinker->nr value increasing or decreasing incorrectly. As a result, when a shrinker repeatedly returns a value of -1 (e.g. a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire, sometimes updating with the scan count that wasn't used, sometimes losing it altogether. Worse is when a shrinker does work and that update is lost due to racy updates, which means the shrinker will do the work again! Fix this by making the total_scan calculations independent of shrinker->nr, and making the shrinker->nr updates atomic w.r.t. other updates via cmpxchg loops. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- mm/vmscan.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 255226554a3d..2f7c6ae8fae0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -251,17 +251,29 @@ unsigned long shrink_slab(struct shrink_control *shrink, unsigned long total_scan; unsigned long max_pass; int shrink_ret = 0; + long nr; + long new_nr; + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work.
+ */ + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + + total_scan = nr; max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { + total_scan += delta; + if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to " "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; + shrinker->shrink, total_scan); + total_scan = max_pass; } /* @@ -269,13 +281,10 @@ unsigned long shrink_slab(struct shrink_control *shrink, * never try to free more than twice the estimate number of * freeable entries. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; - - total_scan = shrinker->nr; - shrinker->nr = 0; + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; - trace_mm_shrink_slab_start(shrinker, shrink, total_scan, + trace_mm_shrink_slab_start(shrinker, shrink, nr, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); @@ -296,9 +305,19 @@ unsigned long shrink_slab(struct shrink_control *shrink, cond_resched(); } - shrinker->nr += total_scan; - trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan, - shrinker->nr); + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); out: -- cgit v1.2.3 From 3567b59aa80ac4417002bf58e35dce5c777d4164 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:36 +1000 Subject: vmscan: reduce wind up shrinker->nr when shrinker can't do work When a shrinker returns -1 to shrink_slab() to indicate it cannot do any work given the current memory reclaim requirements, it adds the entire total_scan count to shrinker->nr. The idea behind this is that when the shrinker is next called and can do work, it will do the work of the previously aborted shrinker call as well. However, if a filesystem is doing lots of allocation with GFP_NOFS set, then we get many, many more aborts from the shrinkers than we do successful calls. The result is that shrinker->nr winds up to its maximum permissible value (twice the current cache size) and then when the next shrinker call that can do work is issued, it has enough scan count built up to free the entire cache twice over. This manifests itself in the cache going from full to empty in a matter of seconds, even when only a small part of the cache needs to be emptied to free sufficient memory. Under metadata intensive workloads on ext4 and XFS, I'm seeing the VFS caches increase memory consumption up to 75% of memory (no page cache pressure) over a period of 30-60s, and then the shrinker empties them down to zero in the space of 2-3s. This cycle repeats over and over again, with the shrinker completely trashing the inode and dentry caches every minute or so for as long as the workload continues. This behaviour was made obvious by the shrink_slab tracepoints added earlier in the series, and made worse by the patch that corrected the concurrent accounting of shrinker->nr. To avoid this problem, stop repeated small increments of the total scan value from winding shrinker->nr up to a value that can cause the entire cache to be freed.
We still need to allow it to wind up, so use the delta as the "large scan" threshold check - if the delta is more than a quarter of the entire cache size, then it is a large scan and allowed to cause lots of windup because we clearly need to free lots of memory. If it isn't a large scan then limit the total scan to half the size of the cache so that windup never increases to consume the whole cache. Reducing the total scan limit further does not allow enough wind-up to maintain the current levels of performance, whilst a higher threshold does not prevent the windup from freeing the entire cache under sustained workloads. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- mm/vmscan.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2f7c6ae8fae0..387422470c95 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -276,6 +276,21 @@ unsigned long shrink_slab(struct shrink_control *shrink, total_scan = max_pass; } + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of -- cgit v1.2.3 From e9299f5058595a655c3b207cda9635e28b9197e6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:37 +1000 Subject: vmscan: add customisable shrinker batch size For shrinkers that have their own cond_resched* calls, having shrink_slab break the work down into small batches is not particularly efficient. Add a custom batchsize field to the struct shrinker so that shrinkers can use a larger batch size if they desire. A value of zero (uninitialised) means "use the default", so behaviour is unchanged by this patch. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- include/linux/mm.h | 1 + mm/vmscan.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9670f71d7be9..9b9777ac726d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1150,6 +1150,7 @@ struct shrink_control { struct shrinker { int (*shrink)(struct shrinker *, struct shrink_control *sc); int seeks; /* seeks to recreate an obj */ + long batch; /* reclaim batch size, 0 = default */ /* These are for internal use */ struct list_head list; diff --git a/mm/vmscan.c b/mm/vmscan.c index 387422470c95..febbc044e792 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -253,6 +253,8 @@ unsigned long shrink_slab(struct shrink_control *shrink, int shrink_ret = 0; long nr; long new_nr; + long batch_size = shrinker->batch ?
shrinker->batch + : SHRINK_BATCH; /* * copy the current shrinker scan count into a local variable @@ -303,19 +305,18 @@ unsigned long shrink_slab(struct shrink_control *shrink, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; + while (total_scan >= batch_size) { int nr_before; nr_before = do_shrinker_shrink(shrinker, shrink, 0); shrink_ret = do_shrinker_shrink(shrinker, shrink, - this_scan); + batch_size); if (shrink_ret == -1) break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } -- cgit v1.2.3 From b56efcf0a45aa7fc32de90d5f9838541082fbc19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Jul 2011 19:04:23 +0200 Subject: slab: shrink sizeof(struct kmem_cache) Reduce high order allocations for some setups. (NR_CPUS=4096 -> we need 64KB per kmem_cache struct) We now allocate exact needed size (using nr_cpu_ids and nr_node_ids) This also makes code a bit smaller on x86_64, since some field offsets are less than the 127 limit : Before patch : # size mm/slab.o text data bss dec hex filename 22605 361665 32 384302 5dd2e mm/slab.o After patch : # size mm/slab.o text data bss dec hex filename 22349 353473 8224 384046 5dc2e mm/slab.o CC: Andrew Morton Reported-by: Konstantin Khlebnikov Signed-off-by: Eric Dumazet Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 26 +++++++++++++------------- mm/slab.c | 10 ++++++---- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index d7f63112f63c..d00e0bacda93 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -24,21 +24,19 @@ */ struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ +/* 1) Cache tunables. Protected by cache_chain_mutex */ unsigned int batchcount; unsigned int limit; unsigned int shared; unsigned int buffer_size; u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ +/* 2) touched by every alloc & free from the backend */ unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ -/* 4) cache_grow/shrink */ +/* 3) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsigned int gfporder; @@ -54,11 +52,11 @@ struct kmem_cache { /* constructor func */ void (*ctor)(void *obj); -/* 5) cache creation/removal */ +/* 4) cache creation/removal */ const char *name; struct list_head next; -/* 6) statistics */ +/* 5) statistics */ #ifdef CONFIG_DEBUG_SLAB unsigned long num_active; unsigned long num_allocations; @@ -85,16 +83,18 @@ struct kmem_cache { int obj_size; #endif /* CONFIG_DEBUG_SLAB */ +/* 6) per-cpu/per-node data, touched during every alloc/free */ /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES + * We put array[] at the end of kmem_cache, because we want to size + * this array to nr_cpu_ids slots instead of NR_CPUS * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. 
+ * We still use [NR_CPUS] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of cpus. */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; + struct kmem_list3 **nodelists; + struct array_cache *array[NR_CPUS]; /* - * Do not add fields after nodelists[] + * Do not add fields after array[] */ }; diff --git a/mm/slab.c b/mm/slab.c index ef8ceb726e71..c3cb3598555a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ +static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache cache_cache = { + .nodelists = cache_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void) cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* - * struct kmem_cache size depends on nr_node_ids, which - * can be less than MAX_NUMNODES. + * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + - nr_node_ids * sizeof(struct kmem_list3 *); + cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *); #if DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif @@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep) goto oops; + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; #if DEBUG cachep->obj_size = size; -- cgit v1.2.3 From 14769de93ffcaeead98bcb5771d9f88a84f7153c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:12:19 +0800 Subject: vmalloc,rcu: Convert call_rcu(rcu_free_va) to kfree_rcu() The rcu callback rcu_free_va() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(rcu_free_va). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Namhyung Kim Cc: David Rientjes Reviewed-by: Josh Triplett --- mm/vmalloc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d34d75366a7..7ff9560e2f83 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -452,13 +452,6 @@ overflow: return ERR_PTR(-EBUSY); } -static void rcu_free_va(struct rcu_head *head) -{ - struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); - - kfree(va); -} - static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); @@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va) if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); - call_rcu(&va->rcu_head, rcu_free_va); + kfree_rcu(va, rcu_head); } /* -- cgit v1.2.3 From 22a3c7d188c2b7bfc8e949bf9fad215c094ba78b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:13:08 +0800 Subject: vmalloc,rcu: Convert call_rcu(rcu_free_vb) to kfree_rcu() The rcu callback rcu_free_vb() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(rcu_free_vb). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney Cc: Andrew Morton Cc: Namhyung Kim Cc: David Rientjes Reviewed-by: Josh Triplett --- mm/vmalloc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ff9560e2f83..ab8494cde007 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -830,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) return vb; } -static void rcu_free_vb(struct rcu_head *head) -{ - struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); - - kfree(vb); -} - static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; @@ -849,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb) BUG_ON(tmp != vb); free_vmap_area_noflush(vb->va); - call_rcu(&vb->rcu_head, rcu_free_vb); + kfree_rcu(vb, rcu_head); } static void purge_fragmented_blocks(int cpu) -- cgit v1.2.3 From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:43 -0400 Subject: fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and its write side is always mirrored by real exclusion. Its intended use is to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall away - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count drops to zero we wake up a pending truncate using wake_up_bit on a new bit in i_state - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system).
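A minimal sketch of the resulting pattern, drawn from the conversions below (the surrounding locking is abbreviated and illustrative only): truncate drains outstanding direct I/O under i_mutex, while each direct I/O request holds a reference on i_dio_count for its lifetime:

	/* truncate side: i_mutex already excludes concurrent truncates */
	mutex_lock(&inode->i_mutex);
	inode_dio_wait(inode);		/* sleeps until i_dio_count reaches 0 */
	/* ... shrink the file ... */
	mutex_unlock(&inode->i_mutex);

	/* direct I/O side: reference taken while i_mutex (or an equivalent
	 * filesystem lock) is held, dropped when the request completes */
	atomic_inc(&inode->i_dio_count);
	/* ... submit and complete the I/O ... */
	inode_dio_done(inode);		/* wakes a waiting truncate at 0 */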
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/attr.c | 5 +---- fs/direct-io.c | 65 +++++++++++++++++++++++++++++++++++++++++------------ fs/inode.c | 3 +-- fs/ntfs/file.c | 3 +-- fs/ntfs/inode.c | 10 ++------- fs/ocfs2/aops.c | 7 +++--- fs/ocfs2/file.c | 15 ++++++------- fs/reiserfs/xattr.c | 3 +-- include/linux/fs.h | 11 +++++++-- mm/filemap.c | 3 --- mm/madvise.c | 2 +- mm/rmap.c | 1 - mm/truncate.c | 3 +-- 13 files changed, 78 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/fs/attr.c b/fs/attr.c index caf2aa521e2b..f177ac86fa48 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return error; if (ia_valid & ATTR_SIZE) - down_write(&dentry->d_inode->i_alloc_sem); + inode_dio_wait(inode); if (inode->i_op->setattr) error = inode->i_op->setattr(dentry, attr); else error = simple_setattr(dentry, attr); - if (ia_valid & ATTR_SIZE) - up_write(&dentry->d_inode->i_alloc_sem); - if (!error) fsnotify_change(dentry, ia_valid); diff --git a/fs/direct-io.c b/fs/direct-io.c index 98ce3ac0d94b..354cbdbc14bd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -135,6 +135,50 @@ struct dio { struct page *pages[DIO_PAGES]; /* page buffer */ }; +static void __inode_dio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + do { + prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&inode->i_dio_count)) + schedule(); + } while (atomic_read(&inode->i_dio_count)); + finish_wait(wq, &q.wait); +} + +/** + * inode_dio_wait - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +void inode_dio_wait(struct inode *inode) +{ + if (atomic_read(&inode->i_dio_count)) + __inode_dio_wait(inode); +} +EXPORT_SYMBOL_GPL(inode_dio_wait); + +/* + * inode_dio_done - signal finish of a direct I/O requests + * @inode: inode the direct I/O happens on + * + * This is called once we've finished processing a direct I/O request, + * and is used to wake up callers waiting for direct I/O to be quiesced. + */ +void inode_dio_done(struct inode *inode) +{ + if (atomic_dec_and_test(&inode->i_dio_count)) + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); +} +EXPORT_SYMBOL_GPL(inode_dio_done); + /* * How many pages are in the queue? */ @@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is } if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - + inode_dio_done(dio->inode); return ret; } @@ -980,9 +1022,6 @@ out: return ret; } -/* - * Releases both i_mutex and i_alloc_sem - */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, @@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * For writes this function is called under i_mutex and returns with * i_mutex held, for reads, i_mutex is not held on entry, but it is * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). 
+ * The i_dio_count counter keeps track of the number of outstanding + * direct I/O requests, and truncate waits for it to reach zero. + * New references to i_dio_count must only be grabbed with i_mutex + * held. * * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * Will be released at I/O completion, possibly in a - * different thread. + * Will be decremented at I/O completion time. */ - down_read_non_owner(&inode->i_alloc_sem); + atomic_inc(&inode->i_dio_count); } /* diff --git a/fs/inode.c b/fs/inode.c index cf81baf1898a..96c77b81167c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mutex_init(&inode->i_mutex); lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); - init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f4b1057abdd2..b59f5ac26bef 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, * fails again. */ if (unlikely(NInoTruncateFailed(ni))) { - down_write(&vi->i_alloc_sem); + inode_dio_wait(vi); err = ntfs_truncate(vi); - up_write(&vi->i_alloc_sem); if (err || NInoTruncateFailed(ni)) { if (!err) err = -EIO; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index c05d6dcf77a4..1371487da955 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run " * * Returns 0 on success or -errno on error. * - * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for - * writing. The only case in the kernel where ->i_alloc_sem is not held is - * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called - * with the current i_size as the offset. The analogous place in NTFS is in - * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again - * without holding ->i_alloc_sem. + * Called with ->i_mutex held. */ int ntfs_truncate(struct inode *vi) { @@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) { * We also abort all changes of user, group, and mode as we do not implement * the NTFS ACLs yet. * - * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also - * called with ->i_alloc_sem held for writing. + * Called with ->i_mutex held. */ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ac97bca282d2..de1d3953599d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -551,9 +551,8 @@ bail: /* * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. Like the core uses - * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from - * truncation on another. + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. 
*/ static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, @@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); if (ocfs2_iocb_is_sem_locked(iocb)) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1406c37a5722..2c3a465514a2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, ocfs2_iocb_clear_sem_locked(iocb); relock: - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { - down_read(&inode->i_alloc_sem); + atomic_inc(&inode->i_dio_count); have_alloc_sem = 1; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_sem_locked(iocb); @@ -2290,7 +2290,7 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); have_alloc_sem = 0; rw_level = -1; @@ -2361,8 +2361,7 @@ out_dio: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. (it's the clustered equivalent of - * i_alloc_sem; protects truncate from racing with pending ios). + * it can unlock our rw lock. * Unfortunately there are error cases which call end_io and others * that don't. so we don't have to unlock the rw_lock if either an * async dio is going to do it in the future or an end_io after an @@ -2379,7 +2378,7 @@ out: out_sems: if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } @@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * need locks to protect pending reads from racing with truncate. */ if (filp->f_flags & O_DIRECT) { - down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + atomic_inc(&inode->i_dio_count); ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); @@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, bail: if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } if (rw_level != -1) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4ea2ab41fdee..6938d8c68d6e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - down_write(&dentry->d_inode->i_alloc_sem); + inode_dio_wait(dentry->d_inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_setattr(dentry, &newattrs); - up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); } else update_ctime(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1393742bba9b..2fe920774abf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -779,7 +779,7 @@ struct inode { struct timespec i_ctime; blkcnt_t i_blocks; unsigned short i_bytes; - struct rw_semaphore i_alloc_sem; + atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space *i_mapping; @@ -1705,6 +1705,10 @@ struct super_operations { * set during data writeback, and cleared with a wakeup * on the bit address once it is done. * + * I_REFERENCED Marks the inode as recently references on the LRU list. 
+ * + * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1718,6 +1722,8 @@ #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) +#define __I_DIO_WAKEUP 9 +#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1828,7 +1834,6 @@ struct file_system_type { struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; - struct lock_class_key i_alloc_sem_key; }; extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, @@ -2404,6 +2409,8 @@ enum { }; void dio_end_io(struct bio *bio, int error); +void inode_dio_wait(struct inode *inode); +void inode_dio_done(struct inode *inode); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8d3457..f820e600f1ad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -78,9 +78,6 @@ * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_mutex - * ->i_alloc_sem (various) - * * inode_wb_list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed503..74bf193eff04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + /* vmtruncate_range needs to take i_mutex */ up_read(&current->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); down_read(&current->mm->mmap_sem); diff --git a/mm/rmap.c b/mm/rmap.c index 23295f65ae43..2540a39eea4a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -21,7 +21,6 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex diff --git a/mm/truncate.c b/mm/truncate.c index e13f22efaad7..003c6c685fc8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return -ENOSYS; mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); + inode_dio_wait(inode); unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); /* unmap again to remove racily COWed private pages */ unmap_mapping_range(mapping, offset, (end - offset), 1); - up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0; -- cgit v1.2.3 From f15146380d28b746df3c8b81b392812eb982382a Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 12 Jul 2011 20:48:39 +0200 Subject: fs: seq_file - add event counter to simplify poll() support Moving the event counter into the dynamically allocated 'struct seq_file' allows poll() support without the need to allocate its own tracking structure. All current users are switched over to use the new counter.
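The per-user pattern after the conversion looks roughly like the md and swaps hunks below; example_event_count, example_waiters and example_seq_ops are placeholders for whatever counter, wait queue and seq_operations the user already has:

static atomic_t example_event_count = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(example_waiters);
static const struct seq_operations example_seq_ops;

static int example_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret = seq_open(file, &example_seq_ops);

	if (ret)
		return ret;
	/* snapshot the counter; no private tracking struct is needed any more */
	seq = file->private_data;
	seq->poll_event = atomic_read(&example_event_count);
	return 0;
}

static unsigned int example_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;
	unsigned int mask = POLLIN | POLLRDNORM;

	poll_wait(file, &example_waiters, wait);
	if (seq->poll_event != atomic_read(&example_event_count))
		mask |= POLLERR | POLLPRI;
	return mask;
}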
Requested-by: Andrew Morton akpm@linux-foundation.org Acked-by: NeilBrown Tested-by: Lucas De Marchi lucas.demarchi@profusion.mobi Signed-off-by: Kay Sievers Signed-off-by: Al Viro --- drivers/md/md.c | 26 ++++++++------------------ fs/namespace.c | 4 ++-- fs/proc/base.c | 2 +- include/linux/mnt_namespace.h | 1 - include/linux/seq_file.h | 1 + mm/swapfile.c | 29 ++++++++--------------------- 6 files changed, 20 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/drivers/md/md.c b/drivers/md/md.c index 91e31e260b4a..dfc9425db70b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6394,16 +6394,11 @@ static void md_seq_stop(struct seq_file *seq, void *v) mddev_put(mddev); } -struct mdstat_info { - int event; -}; - static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; sector_t sectors; mdk_rdev_t *rdev; - struct mdstat_info *mi = seq->private; struct bitmap *bitmap; if (v == (void*)1) { @@ -6415,7 +6410,7 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&pers_lock); seq_printf(seq, "\n"); - mi->event = atomic_read(&md_event_count); + seq->poll_event = atomic_read(&md_event_count); return 0; } if (v == (void*)2) { @@ -6527,26 +6522,21 @@ static const struct seq_operations md_seq_ops = { static int md_seq_open(struct inode *inode, struct file *file) { + struct seq_file *seq; int error; - struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); - if (mi == NULL) - return -ENOMEM; error = seq_open(file, &md_seq_ops); if (error) - kfree(mi); - else { - struct seq_file *p = file->private_data; - p->private = mi; - mi->event = atomic_read(&md_event_count); - } + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); return error; } static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { - struct seq_file *m = filp->private_data; - struct mdstat_info *mi = m->private; + struct seq_file *seq = filp->private_data; int mask; poll_wait(filp, &md_event_waiters, wait); @@ -6554,7 +6544,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait) /* always allow read */ mask = POLLIN | POLLRDNORM; - if (mi->event != atomic_read(&md_event_count)) + if (seq->poll_event != atomic_read(&md_event_count)) mask |= POLLERR | POLLPRI; return mask; } diff --git a/fs/namespace.c b/fs/namespace.c index fe59bd145d21..cda50fe9250a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -934,8 +934,8 @@ int mnt_had_events(struct proc_mounts *p) int res = 0; br_read_lock(vfsmount_lock); - if (p->event != ns->event) { - p->event = ns->event; + if (p->m.poll_event != ns->event) { + p->m.poll_event = ns->event; res = 1; } br_read_unlock(vfsmount_lock); diff --git a/fs/proc/base.c b/fs/proc/base.c index be1ff932033b..3dc5e2a5cc38 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -673,7 +673,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->m.private = p; p->ns = ns; p->root = root; - p->event = ns->event; + p->m.poll_event = ns->event; return 0; diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 0b89efc6f215..29304855652d 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -18,7 +18,6 @@ struct proc_mounts { struct seq_file m; /* must be the first element */ struct mnt_namespace *ns; struct path root; - int event; }; struct fs_struct; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 03c0232b4169..be720cd2038d 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -23,6 +23,7 @@ 
struct seq_file { u64 version; struct mutex lock; const struct seq_operations *op; + int poll_event; void *private; }; diff --git a/mm/swapfile.c b/mm/swapfile.c index ff8dc1a18cb4..1b8c33907242 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1681,19 +1681,14 @@ out: } #ifdef CONFIG_PROC_FS -struct proc_swaps { - struct seq_file seq; - int event; -}; - static unsigned swaps_poll(struct file *file, poll_table *wait) { - struct proc_swaps *s = file->private_data; + struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); - if (s->event != atomic_read(&proc_poll_event)) { - s->event = atomic_read(&proc_poll_event); + if (seq->poll_event != atomic_read(&proc_poll_event)) { + seq->poll_event = atomic_read(&proc_poll_event); return POLLIN | POLLRDNORM | POLLERR | POLLPRI; } @@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - struct proc_swaps *s; + struct seq_file *seq; int ret; - s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); - if (!s) - return -ENOMEM; - - file->private_data = s; - ret = seq_open(file, &swaps_op); - if (ret) { - kfree(s); + if (ret) return ret; - } - s->seq.private = s; - s->event = atomic_read(&proc_poll_event); - return ret; + seq = file->private_data; + seq->poll_event = atomic_read(&proc_poll_event); + return 0; } static const struct file_operations proc_swaps_operations = { -- cgit v1.2.3 From 497888cf69bf607ac1fe061a6437e0a670b0022f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Thu, 14 Jul 2011 15:07:13 +0300 Subject: treewide: fix potentially dangerous trailing ';' in #defined values/expressions All these are instances of #define NAME value; or #define NAME(params_opt) value; These of course fail to build when used in contexts like if(foo $OP NAME) while(bar $OP NAME) and may silently generate the wrong code in contexts such as foo = NAME + 1; /* foo = value; + 1; */ bar = NAME - 1; /* bar = value; - 1; */ baz = NAME & quux; /* baz = value; & quux; */ Reported on comp.lang.c, Message-ID: Initial analysis of the dangers provided by Keith Thompson in that thread. There are many more instances of more complicated macros having unnecessary trailing semicolons, but this pile seems to be all of the cases of simple values suffering from the problem. (Thus things that are likely to be found in one of the contexts above, more complicated ones aren't.) 
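
As a concrete illustration of the failure modes described above (the names here are made up and do not come from any of the files touched below):

#define MAX_RETRIES	8;		/* the kind of trailing ';' this patch removes */

int demo(int quux)
{
	int foo, baz;

	/* if (quux < MAX_RETRIES) expands to "if (quux < 8;)" and fails to compile */

	foo = MAX_RETRIES + 1;		/* expands to "foo = 8; + 1;" - foo is 8,
					 * "+ 1;" is a separate, silently discarded statement */
	baz = MAX_RETRIES & quux;	/* expands to "baz = 8; & quux;" - baz is 8, not (8 & quux) */

	return foo + baz;
}
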
Signed-off-by: Phil Carmody Signed-off-by: Jiri Kosina --- Documentation/DocBook/v4l/io.xml | 2 +- arch/alpha/include/asm/floppy.h | 2 +- arch/h8300/kernel/setup.c | 2 +- arch/ia64/include/asm/sn/tioce.h | 2 +- arch/mips/include/asm/floppy.h | 2 +- arch/parisc/include/asm/dma-mapping.h | 2 +- arch/parisc/math-emu/decode_exc.c | 2 +- arch/powerpc/include/asm/elf.h | 4 ++-- arch/powerpc/include/asm/smu.h | 2 +- arch/sparc/include/asm/elf_64.h | 2 +- arch/um/sys-i386/signal.c | 2 +- arch/x86/kernel/i387.c | 2 +- drivers/acpi/ac.c | 2 +- drivers/acpi/battery.c | 2 +- drivers/acpi/sbs.c | 2 +- drivers/gpu/drm/sis/sis_drv.h | 4 ++-- drivers/hwmon/gl520sm.c | 2 +- drivers/isdn/i4l/isdn_bsdcomp.c | 2 +- drivers/net/bsd_comp.c | 2 +- drivers/net/natsemi.c | 2 +- drivers/net/r8169.c | 2 +- drivers/net/s2io.h | 4 ++-- drivers/net/wireless/iwlegacy/iwl-commands.h | 4 ++-- drivers/net/wireless/iwlwifi/iwl-commands.h | 4 ++-- drivers/net/wireless/rtlwifi/rtl8192ce/reg.h | 4 ++-- drivers/scsi/device_handler/scsi_dh_rdac.c | 2 +- drivers/scsi/lpfc/lpfc_hw.h | 8 ++++---- drivers/usb/otg/twl4030-usb.c | 2 +- drivers/video/i810/i810.h | 2 +- include/linux/ceph/libceph.h | 2 +- include/linux/mfd/tps65910.h | 2 +- include/scsi/scsi.h | 2 +- include/sound/soundfont.h | 2 +- mm/slub.c | 2 +- net/mac80211/mesh_hwmp.c | 20 ++++++++++---------- 35 files changed, 53 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/Documentation/DocBook/v4l/io.xml b/Documentation/DocBook/v4l/io.xml index 227e7ac45a06..c57d1ec6291c 100644 --- a/Documentation/DocBook/v4l/io.xml +++ b/Documentation/DocBook/v4l/io.xml @@ -210,7 +210,7 @@ for (i = 0; i < reqbuf.count; i++) &v4l2-requestbuffers; reqbuf; /* Our current format uses 3 planes per buffer */ -#define FMT_NUM_PLANES = 3; +#define FMT_NUM_PLANES = 3 struct { void *start[FMT_NUM_PLANES]; diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h index 0be50413b2b5..46cefbd50e73 100644 --- a/arch/alpha/include/asm/floppy.h +++ b/arch/alpha/include/asm/floppy.h @@ -27,7 +27,7 @@ #define fd_cacheflush(addr,size) /* nothing */ #define fd_request_irq() request_irq(FLOPPY_IRQ, floppy_interrupt,\ IRQF_DISABLED, "floppy", NULL) -#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL); +#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL) #ifdef CONFIG_PCI diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index 7fda657110eb..68d651081bd3 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -46,7 +46,7 @@ #include #endif -#define STUBSIZE 0xc000; +#define STUBSIZE 0xc000 unsigned long rom_length; unsigned long memory_start; diff --git a/arch/ia64/include/asm/sn/tioce.h b/arch/ia64/include/asm/sn/tioce.h index 893468e1b41b..6eae8ada90f0 100644 --- a/arch/ia64/include/asm/sn/tioce.h +++ b/arch/ia64/include/asm/sn/tioce.h @@ -467,7 +467,7 @@ typedef volatile struct tioce { #define CE_LSI_GB_CFG1_RXL0S_THS_SHFT 0 #define CE_LSI_GB_CFG1_RXL0S_THS_MASK (0xffULL << 0) #define CE_LSI_GB_CFG1_RXL0S_SMP_SHFT 8 -#define CE_LSI_GB_CFG1_RXL0S_SMP_MASK (0xfULL << 8); +#define CE_LSI_GB_CFG1_RXL0S_SMP_MASK (0xfULL << 8) #define CE_LSI_GB_CFG1_RXL0S_ADJ_SHFT 12 #define CE_LSI_GB_CFG1_RXL0S_ADJ_MASK (0x7ULL << 12) #define CE_LSI_GB_CFG1_RXL0S_FLT_SHFT 15 diff --git a/arch/mips/include/asm/floppy.h b/arch/mips/include/asm/floppy.h index c5c7c0e6064c..4456c9c47e21 100644 --- a/arch/mips/include/asm/floppy.h +++ b/arch/mips/include/asm/floppy.h @@ -29,7 +29,7 @@ static inline void fd_cacheflush(char * addr, long size) #define FLOPPY0_TYPE 
fd_drive_type(0) #define FLOPPY1_TYPE fd_drive_type(1) -#define FDC1 fd_getfdaddr1(); +#define FDC1 fd_getfdaddr1() #define N_FDC 1 /* do you *really* want a second controller? */ #define N_DRIVE 8 diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index 4ef73b09b168..890531e32fe8 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -210,7 +210,7 @@ parisc_walk_tree(struct device *dev) return dev->platform_data; } -#define GET_IOC(dev) (HBA_DATA(parisc_walk_tree(dev))->iommu); +#define GET_IOC(dev) (HBA_DATA(parisc_walk_tree(dev))->iommu) #ifdef CONFIG_IOMMU_CCIO diff --git a/arch/parisc/math-emu/decode_exc.c b/arch/parisc/math-emu/decode_exc.c index 27a7492ddb0d..04e550e76ae8 100644 --- a/arch/parisc/math-emu/decode_exc.c +++ b/arch/parisc/math-emu/decode_exc.c @@ -56,7 +56,7 @@ /* General definitions */ #define DOESTRAP 1 #define NOTRAP 0 -#define SIGNALCODE(signal, code) ((signal) << 24 | (code)); +#define SIGNALCODE(signal, code) ((signal) << 24 | (code)) #define copropbit 1<<31-2 /* bit position 2 */ #define opclass 9 /* bits 21 & 22 */ #define fmt 11 /* bits 19 & 20 */ diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 2b917c69ed15..3bf9cca35147 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -267,7 +267,7 @@ extern int ucache_bsize; struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); -#define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); +#define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b) /* 1GB for 64bit, 8MB for 32bit */ #define STACK_RND_MASK (is_32bit_task() ? \ @@ -298,7 +298,7 @@ do { \ NEW_AUX_ENT(AT_DCACHEBSIZE, dcache_bsize); \ NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize); \ NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize); \ - VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base) \ + VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base); \ } while (0) /* PowerPC64 relocations defined by the ABIs */ diff --git a/arch/powerpc/include/asm/smu.h b/arch/powerpc/include/asm/smu.h index e3bdada8c542..ae20ce1af4c7 100644 --- a/arch/powerpc/include/asm/smu.h +++ b/arch/powerpc/include/asm/smu.h @@ -547,7 +547,7 @@ struct smu_sdbp_header { * (currently, afaik, this concerns only the FVT partition * (0x12) */ -#define SMU_U16_MIX(x) le16_to_cpu(x); +#define SMU_U16_MIX(x) le16_to_cpu(x) #define SMU_U32_MIX(x) ((((x) & 0xff00ff00u) >> 8)|(((x) & 0x00ff00ffu) << 8)) diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h index e67880381b84..cfa9cd2e5519 100644 --- a/arch/sparc/include/asm/elf_64.h +++ b/arch/sparc/include/asm/elf_64.h @@ -186,7 +186,7 @@ static inline unsigned int sparc64_elf_hwcap(void) return cap; } -#define ELF_HWCAP sparc64_elf_hwcap(); +#define ELF_HWCAP sparc64_elf_hwcap() /* This yields a string that ld.so will use to load implementation specific libraries for optimization. 
This is more specific in diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index 129647375a6c..89a46626bfd8 100644 --- a/arch/um/sys-i386/signal.c +++ b/arch/um/sys-i386/signal.c @@ -58,7 +58,7 @@ static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) unsigned long ret = 0xffff0000; int i; -#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16); +#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) for (i = 0; i < 8; i++) { if (twd & 0x1) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 12aff2537682..739d8598f789 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) return tmp; } -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) #define FP_EXP_TAG_VALID 0 #define FP_EXP_TAG_ZERO 1 #define FP_EXP_TAG_SPECIAL 2 diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 58c3f74bd84c..6512b20aeccd 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -89,7 +89,7 @@ struct acpi_ac { unsigned long long state; }; -#define to_acpi_ac(x) container_of(x, struct acpi_ac, charger); +#define to_acpi_ac(x) container_of(x, struct acpi_ac, charger) #ifdef CONFIG_ACPI_PROCFS_POWER static const struct file_operations acpi_ac_fops = { diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index fcc13ac0aa18..2c661353e8f2 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -132,7 +132,7 @@ struct acpi_battery { unsigned long flags; }; -#define to_acpi_battery(x) container_of(x, struct acpi_battery, bat); +#define to_acpi_battery(x) container_of(x, struct acpi_battery, bat) inline int acpi_battery_present(struct acpi_battery *battery) { diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 51ae3794ec7f..50658ff887d9 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -112,7 +112,7 @@ struct acpi_battery { u8 have_sysfs_alarm:1; }; -#define to_acpi_battery(x) container_of(x, struct acpi_battery, bat); +#define to_acpi_battery(x) container_of(x, struct acpi_battery, bat) struct acpi_sbs { struct power_supply charger; diff --git a/drivers/gpu/drm/sis/sis_drv.h b/drivers/gpu/drm/sis/sis_drv.h index ef940bad63f7..194303c177ad 100644 --- a/drivers/gpu/drm/sis/sis_drv.h +++ b/drivers/gpu/drm/sis/sis_drv.h @@ -48,8 +48,8 @@ enum sis_family { #define SIS_BASE (dev_priv->mmio) -#define SIS_READ(reg) DRM_READ32(SIS_BASE, reg); -#define SIS_WRITE(reg, val) DRM_WRITE32(SIS_BASE, reg, val); +#define SIS_READ(reg) DRM_READ32(SIS_BASE, reg) +#define SIS_WRITE(reg, val) DRM_WRITE32(SIS_BASE, reg, val) typedef struct drm_sis_private { drm_local_map_t *mmio; diff --git a/drivers/hwmon/gl520sm.c b/drivers/hwmon/gl520sm.c index ec588026f0a9..131ea8625f08 100644 --- a/drivers/hwmon/gl520sm.c +++ b/drivers/hwmon/gl520sm.c @@ -273,7 +273,7 @@ static SENSOR_DEVICE_ATTR(in4_max, S_IRUGO | S_IWUSR, #define DIV_FROM_REG(val) (1 << (val)) #define FAN_FROM_REG(val,div) ((val)==0 ? 
0 : (480000/((val) << (div)))) -#define FAN_TO_REG(val,div) ((val)<=0?0:SENSORS_LIMIT((480000 + ((val) << ((div)-1))) / ((val) << (div)), 1, 255)); +#define FAN_TO_REG(val,div) ((val)<=0?0:SENSORS_LIMIT((480000 + ((val) << ((div)-1))) / ((val) << (div)), 1, 255)) static ssize_t get_fan_input(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/drivers/isdn/i4l/isdn_bsdcomp.c b/drivers/isdn/i4l/isdn_bsdcomp.c index 02d9918705dd..aa0b6a6f5ef4 100644 --- a/drivers/isdn/i4l/isdn_bsdcomp.c +++ b/drivers/isdn/i4l/isdn_bsdcomp.c @@ -155,7 +155,7 @@ struct bsd_db { #define LAST 255 #define MAXCODE(b) ((1 << (b)) - 1) -#define BADCODEM1 MAXCODE(MAX_BSD_BITS); +#define BADCODEM1 MAXCODE(MAX_BSD_BITS) #define BSD_HASH(prefix,suffix,hshift) ((((unsigned long)(suffix))<<(hshift)) \ ^ (unsigned long)(prefix)) diff --git a/drivers/net/bsd_comp.c b/drivers/net/bsd_comp.c index 6e99d80ec409..a9b759add187 100644 --- a/drivers/net/bsd_comp.c +++ b/drivers/net/bsd_comp.c @@ -201,7 +201,7 @@ extern void ppp_unregister_compressor (struct compressor *cp); #define LAST 255 #define MAXCODE(b) ((1 << (b)) - 1) -#define BADCODEM1 MAXCODE(MAX_BSD_BITS); +#define BADCODEM1 MAXCODE(MAX_BSD_BITS) #define BSD_HASH(prefix,suffix,hshift) ((((unsigned long)(suffix))<<(hshift)) \ ^ (unsigned long)(prefix)) diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c index 68e6b0224edd..c69f82a17c28 100644 --- a/drivers/net/natsemi.c +++ b/drivers/net/natsemi.c @@ -1382,7 +1382,7 @@ static int find_mii(struct net_device *dev) /* WCSR bits [0:4] [9:10] */ #define WCSR_RESET_SAVE 0x61f /* RFCR bits [20] [22] [27:31] */ -#define RFCR_RESET_SAVE 0xf8500000; +#define RFCR_RESET_SAVE 0xf8500000 static void natsemi_reset(struct net_device *dev) { diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 5990621fb5cd..6f3630618fa8 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -667,7 +667,7 @@ struct rtl8169_private { u32 saved_wolopts; const struct firmware *fw; -#define RTL_FIRMWARE_UNKNOWN ERR_PTR(-EAGAIN); +#define RTL_FIRMWARE_UNKNOWN ERR_PTR(-EAGAIN) }; MODULE_AUTHOR("Realtek and the Linux r8169 crew "); diff --git a/drivers/net/s2io.h b/drivers/net/s2io.h index 800b3a44e653..57a4dc7c7d7a 100644 --- a/drivers/net/s2io.h +++ b/drivers/net/s2io.h @@ -968,8 +968,8 @@ struct s2io_nic { u8 serial_num[VPD_STRING_LEN]; }; -#define RESET_ERROR 1; -#define CMD_ERROR 2; +#define RESET_ERROR 1 +#define CMD_ERROR 2 /* OS related system calls */ #ifndef readq diff --git a/drivers/net/wireless/iwlegacy/iwl-commands.h b/drivers/net/wireless/iwlegacy/iwl-commands.h index 17a1d504348e..6a5c76e7345f 100644 --- a/drivers/net/wireless/iwlegacy/iwl-commands.h +++ b/drivers/net/wireless/iwlegacy/iwl-commands.h @@ -2624,8 +2624,8 @@ struct iwl_scanstart_notification { __le32 status; } __packed; -#define SCAN_OWNER_STATUS 0x1; -#define MEASURE_OWNER_STATUS 0x2; +#define SCAN_OWNER_STATUS 0x1 +#define MEASURE_OWNER_STATUS 0x2 #define IWL_PROBE_STATUS_OK 0 #define IWL_PROBE_STATUS_TX_FAILED BIT(0) diff --git a/drivers/net/wireless/iwlwifi/iwl-commands.h b/drivers/net/wireless/iwlwifi/iwl-commands.h index 6ee5f1aa555c..6288d1fdfcc6 100644 --- a/drivers/net/wireless/iwlwifi/iwl-commands.h +++ b/drivers/net/wireless/iwlwifi/iwl-commands.h @@ -2457,8 +2457,8 @@ struct iwl_scanstart_notification { __le32 status; } __packed; -#define SCAN_OWNER_STATUS 0x1; -#define MEASURE_OWNER_STATUS 0x2; +#define SCAN_OWNER_STATUS 0x1 +#define MEASURE_OWNER_STATUS 0x2 #define IWL_PROBE_STATUS_OK 0 #define IWL_PROBE_STATUS_TX_FAILED 
BIT(0) diff --git a/drivers/net/wireless/rtlwifi/rtl8192ce/reg.h b/drivers/net/wireless/rtlwifi/rtl8192ce/reg.h index 598cecc63f41..5b43749031dd 100644 --- a/drivers/net/wireless/rtlwifi/rtl8192ce/reg.h +++ b/drivers/net/wireless/rtlwifi/rtl8192ce/reg.h @@ -1074,10 +1074,10 @@ #define _SRL(x) (((x) & 0x3F) << 8) #define _SIFS_CCK_CTX(x) ((x) & 0xFF) -#define _SIFS_CCK_TRX(x) (((x) & 0xFF) << 8); +#define _SIFS_CCK_TRX(x) (((x) & 0xFF) << 8) #define _SIFS_OFDM_CTX(x) ((x) & 0xFF) -#define _SIFS_OFDM_TRX(x) (((x) & 0xFF) << 8); +#define _SIFS_OFDM_TRX(x) (((x) & 0xFF) << 8) #define _TBTT_PROHIBIT_HOLD(x) (((x) & 0xFF) << 8) diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index e7fc70d6b478..2e7c136bb805 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -35,7 +35,7 @@ * mode page were taken from the LSI RDAC 2.4 GPL'd * driver, and then converted to Linux conventions. */ -#define RDAC_QUIESCENCE_TIME 20; +#define RDAC_QUIESCENCE_TIME 20 /* * Page Codes */ diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h index 9059524cf225..ab4c4d651d0c 100644 --- a/drivers/scsi/lpfc/lpfc_hw.h +++ b/drivers/scsi/lpfc/lpfc_hw.h @@ -2955,18 +2955,18 @@ typedef struct _SLI2_RDSC { typedef struct _PCB { #ifdef __BIG_ENDIAN_BITFIELD uint32_t type:8; -#define TYPE_NATIVE_SLI2 0x01; +#define TYPE_NATIVE_SLI2 0x01 uint32_t feature:8; -#define FEATURE_INITIAL_SLI2 0x01; +#define FEATURE_INITIAL_SLI2 0x01 uint32_t rsvd:12; uint32_t maxRing:4; #else /* __LITTLE_ENDIAN_BITFIELD */ uint32_t maxRing:4; uint32_t rsvd:12; uint32_t feature:8; -#define FEATURE_INITIAL_SLI2 0x01; +#define FEATURE_INITIAL_SLI2 0x01 uint32_t type:8; -#define TYPE_NATIVE_SLI2 0x01; +#define TYPE_NATIVE_SLI2 0x01 #endif uint32_t mailBoxSize; diff --git a/drivers/usb/otg/twl4030-usb.c b/drivers/usb/otg/twl4030-usb.c index efeb4d1517ff..14f66c358629 100644 --- a/drivers/usb/otg/twl4030-usb.c +++ b/drivers/usb/otg/twl4030-usb.c @@ -166,7 +166,7 @@ struct twl4030_usb { }; /* internal define on top of container_of */ -#define xceiv_to_twl(x) container_of((x), struct twl4030_usb, otg); +#define xceiv_to_twl(x) container_of((x), struct twl4030_usb, otg) /*-------------------------------------------------------------------------*/ diff --git a/drivers/video/i810/i810.h b/drivers/video/i810/i810.h index f37de60ecc59..1414b73ac55b 100644 --- a/drivers/video/i810/i810.h +++ b/drivers/video/i810/i810.h @@ -137,7 +137,7 @@ #define DRAM_ON 0x08 #define DRAM_OFF 0xE7 #define PG_ENABLE_MASK 0x01 -#define RING_SIZE_MASK (RINGBUFFER_SIZE - 1); +#define RING_SIZE_MASK (RINGBUFFER_SIZE - 1) /* defines for restoring registers partially */ #define ADDR_MAP_MASK (0x07 << 5) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 6365f041745b..563755181c1e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,7 +35,7 @@ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ -#define CEPH_OPT_DEFAULT (0); +#define CEPH_OPT_DEFAULT (0) #define ceph_set_opt(client, opt) \ (client)->options->flags |= CEPH_OPT_##opt; diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h index 8bb85b930c07..73572c65d04f 100644 --- a/include/linux/mfd/tps65910.h +++ b/include/linux/mfd/tps65910.h @@ -269,7 +269,7 @@ #define LDO1_SEL_MASK 0xFC #define LDO3_SEL_MASK 0x7C #define LDO_MIN_VOLT 1000 -#define LDO_MAX_VOLT 3300; +#define 
LDO_MAX_VOLT 3300 /*Register VDIG1 (0x80) register.RegisterDescription */ diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 3668903e397b..8001ae4cd7ba 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -495,7 +495,7 @@ static inline int scsi_is_wlun(unsigned int lun) #define sense_class(sense) (((sense) >> 4) & 0x7) #define sense_error(sense) ((sense) & 0xf) -#define sense_valid(sense) ((sense) & 0x80); +#define sense_valid(sense) ((sense) & 0x80) /* * default timeouts diff --git a/include/sound/soundfont.h b/include/sound/soundfont.h index f95d99ba7f74..679df0574066 100644 --- a/include/sound/soundfont.h +++ b/include/sound/soundfont.h @@ -121,7 +121,7 @@ int snd_soundfont_search_zone(struct snd_sf_list *sflist, int *notep, int vel, int snd_sf_calc_parm_hold(int msec); int snd_sf_calc_parm_attack(int msec); int snd_sf_calc_parm_decay(int msec); -#define snd_sf_calc_parm_delay(msec) (0x8000 - (msec) * 1000 / 725); +#define snd_sf_calc_parm_delay(msec) (0x8000 - (msec) * 1000 / 725) extern int snd_sf_vol_table[128]; int snd_sf_linear_to_log(unsigned int amount, int offset, int ratio); diff --git a/mm/slub.c b/mm/slub.c index 35f351f26193..5b7f7eb680e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4058,7 +4058,7 @@ static int any_slab_objects(struct kmem_cache *s) #endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) -#define to_slab(n) container_of(n, struct kmem_cache, kobj); +#define to_slab(n) container_of(n, struct kmem_cache, kobj) struct slab_attribute { struct attribute attr; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 2b18053070c1..3460108810d5 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -57,29 +57,29 @@ static inline u32 u16_field_get(u8 *preq_elem, int offset, bool ae) #define PREQ_IE_TTL(x) (*(x + 2)) #define PREQ_IE_PREQ_ID(x) u32_field_get(x, 3, 0) #define PREQ_IE_ORIG_ADDR(x) (x + 7) -#define PREQ_IE_ORIG_SN(x) u32_field_get(x, 13, 0); -#define PREQ_IE_LIFETIME(x) u32_field_get(x, 17, AE_F_SET(x)); -#define PREQ_IE_METRIC(x) u32_field_get(x, 21, AE_F_SET(x)); +#define PREQ_IE_ORIG_SN(x) u32_field_get(x, 13, 0) +#define PREQ_IE_LIFETIME(x) u32_field_get(x, 17, AE_F_SET(x)) +#define PREQ_IE_METRIC(x) u32_field_get(x, 21, AE_F_SET(x)) #define PREQ_IE_TARGET_F(x) (*(AE_F_SET(x) ? x + 32 : x + 26)) #define PREQ_IE_TARGET_ADDR(x) (AE_F_SET(x) ? x + 33 : x + 27) -#define PREQ_IE_TARGET_SN(x) u32_field_get(x, 33, AE_F_SET(x)); +#define PREQ_IE_TARGET_SN(x) u32_field_get(x, 33, AE_F_SET(x)) #define PREP_IE_FLAGS(x) PREQ_IE_FLAGS(x) #define PREP_IE_HOPCOUNT(x) PREQ_IE_HOPCOUNT(x) #define PREP_IE_TTL(x) PREQ_IE_TTL(x) #define PREP_IE_ORIG_ADDR(x) (x + 3) -#define PREP_IE_ORIG_SN(x) u32_field_get(x, 9, 0); -#define PREP_IE_LIFETIME(x) u32_field_get(x, 13, AE_F_SET(x)); -#define PREP_IE_METRIC(x) u32_field_get(x, 17, AE_F_SET(x)); +#define PREP_IE_ORIG_SN(x) u32_field_get(x, 9, 0) +#define PREP_IE_LIFETIME(x) u32_field_get(x, 13, AE_F_SET(x)) +#define PREP_IE_METRIC(x) u32_field_get(x, 17, AE_F_SET(x)) #define PREP_IE_TARGET_ADDR(x) (AE_F_SET(x) ? 
x + 27 : x + 21) -#define PREP_IE_TARGET_SN(x) u32_field_get(x, 27, AE_F_SET(x)); +#define PREP_IE_TARGET_SN(x) u32_field_get(x, 27, AE_F_SET(x)) #define PERR_IE_TTL(x) (*(x)) #define PERR_IE_TARGET_FLAGS(x) (*(x + 2)) #define PERR_IE_TARGET_ADDR(x) (x + 3) -#define PERR_IE_TARGET_SN(x) u32_field_get(x, 9, 0); -#define PERR_IE_TARGET_RCODE(x) u16_field_get(x, 13, 0); +#define PERR_IE_TARGET_SN(x) u32_field_get(x, 9, 0) +#define PERR_IE_TARGET_RCODE(x) u16_field_get(x, 13, 0) #define MSEC_TO_TU(x) (x*1000/1024) #define SN_GT(x, y) ((long) (y) - (long) (x) < 0) -- cgit v1.2.3 From 7ea466f2256b02a7047dfd47d76a2f6c1e427e3e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 21 Jul 2011 09:42:45 +0900 Subject: slab: fix DEBUG_SLAB warning In commit c225150b "slab: fix DEBUG_SLAB build", "if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))" is always true if ARCH_SLAB_MINALIGN == 0. Do not print warning if ARCH_SLAB_MINALIGN == 0. Signed-off-by: Tetsuo Handa Signed-off-by: Pekka Enberg --- mm/slab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c3cb3598555a..dc2f068c0b7d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3155,7 +3155,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); - if ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1)) { + if (ARCH_SLAB_MINALIGN && + ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } -- cgit v1.2.3 From ef3230880abd36553ab442363d3c9a0661f00769 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 23 Jul 2011 20:44:24 +0200 Subject: backing-dev: use synchronize_rcu_expedited instead of synchronize_rcu backing-dev: use synchronize_rcu_expedited instead of synchronize_rcu synchronize_rcu sleeps several timer ticks. synchronize_rcu_expedited is much faster. With 100Hz timer frequency, when we remove 10000 block devices with "dmsetup remove_all" command, it takes 27 minutes. With this patch, removing 10000 block devices takes only 15 seconds. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09a..2ef0dc9e7f39 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -505,7 +505,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); - synchronize_rcu(); + synchronize_rcu_expedited(); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, -- cgit v1.2.3 From bcff25fc8aa47a13faff8b4b992589813f7b450a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Jul 2011 13:31:25 -0600 Subject: mm: properly reflect task dirty limits in dirty_exceeded logic We set bdi->dirty_exceeded (and thus ratelimiting code starts to call balance_dirty_pages() every 8 pages) when a per-bdi limit is exceeded or global limit is exceeded. But per-bdi limit also depends on the task. Thus different tasks reach the limit on that bdi at different levels of dirty pages. The result is that with current code bdi->dirty_exceeded ping-ponged between 1 and 0 depending on which task just got into balance_dirty_pages(). 
We fix the issue by clearing bdi->dirty_exceeded only when per-bdi amount of dirty pages drops below the threshold (7/8 * bdi_dirty_limit) where task limits already do not have any influence. Impact: The end result is, the dirty pages are kept more tightly under control, with the average number slightly lowered than before. This reduces the risk to throttle light dirtiers and hence more responsive. However it may add overheads by enforcing balance_dirty_pages() calls on every 8 pages when there are 2+ heavy dirtiers. CC: Andrew Morton CC: Christoph Hellwig CC: Dave Chinner CC: Peter Zijlstra Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f9d9f5476d58..1d781803e629 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -281,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * effectively curb the growth of dirty pages. Light dirtiers with high enough * dirty threshold may never get throttled. */ +#define TASK_LIMIT_FRACTION 8 static unsigned long task_dirty_limit(struct task_struct *tsk, unsigned long bdi_dirty) { long numerator, denominator; unsigned long dirty = bdi_dirty; - u64 inv = dirty >> 3; + u64 inv = dirty / TASK_LIMIT_FRACTION; task_dirties_fraction(tsk, &numerator, &denominator); inv *= numerator; @@ -297,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk, return max(dirty, bdi_dirty/2); } +/* Minimum limit for any task */ +static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) +{ + return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; +} + /* * */ @@ -651,9 +658,12 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long task_bdi_thresh; + unsigned long min_task_bdi_thresh; unsigned long pages_written = 0; unsigned long pause = 1; bool dirty_exceeded = false; + bool clear_dirty_exceeded = true; struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long start_time = jiffies; @@ -673,7 +683,8 @@ static void balance_dirty_pages(struct address_space *mapping, break; bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - bdi_thresh = task_dirty_limit(current, bdi_thresh); + min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); + task_bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -685,7 +696,7 @@ static void balance_dirty_pages(struct address_space *mapping, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { + if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_dirty = bdi_nr_reclaimable + bdi_stat_sum(bdi, BDI_WRITEBACK); @@ -701,8 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping, * bdi or process from holding back light ones; The latter is * the last resort safeguard. */ - dirty_exceeded = (bdi_dirty > bdi_thresh) || + dirty_exceeded = (bdi_dirty > task_bdi_thresh) || (nr_dirty > dirty_thresh); + clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && + (nr_dirty <= dirty_thresh); if (!dirty_exceeded) break; @@ -723,7 +736,7 @@ static void balance_dirty_pages(struct address_space *mapping, * up. 
*/ trace_balance_dirty_start(bdi); - if (bdi_nr_reclaimable > bdi_thresh) { + if (bdi_nr_reclaimable > task_bdi_thresh) { pages_written += writeback_inodes_wb(&bdi->wb, write_chunk); trace_balance_dirty_written(bdi, pages_written); @@ -766,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping, pause = HZ / 10; } - if (!dirty_exceeded && bdi->dirty_exceeded) + /* Clear dirty_exceeded flag only when no task can exceed the limit */ + if (clear_dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) -- cgit v1.2.3 From 50a15981a1fac7e019ff7c3cba87531fb580f065 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sun, 24 Jul 2011 10:47:59 +0200 Subject: [S390] reference bit testing for unmapped pages On x86 a page without a mapper is by definition not referenced / old. The s390 architecture keeps the reference bit in the storage key and the current code will check the storage key for page without a mapper. This leads to an interesting effect: the first time an s390 system needs to write pages to swap it only finds referenced pages. This causes a lot of pages to get added and written to the swap device. To avoid this behaviour change page_referenced to query the storage key only if there is a mapper of the page. Signed-off-by: Martin Schwidefsky --- mm/rmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 2540a39eea4a..9701574bb67a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -869,11 +869,11 @@ int page_referenced(struct page *page, vm_flags); if (we_locked) unlock_page(page); + + if (page_test_and_clear_young(page_to_pfn(page))) + referenced++; } out: - if (page_test_and_clear_young(page_to_pfn(page))) - referenced++; - return referenced; } -- cgit v1.2.3 From 9e577e8b46ab0c38970c0f0cd7eae62e6dffddee Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 22 Jul 2011 09:35:14 -0500 Subject: slub: When allocating a new slab also prep the first object We need to branch to the debug code for the first object if we allocate a new slab otherwise the first object will be marked wrongly as inactive. Tested-by: Rabin Vincent Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7836b45ea1fa..e842c19e67fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2082,6 +2082,9 @@ new_slab: stat(s, ALLOC_SLAB); c->node = page_to_nid(page); c->page = page; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) -- cgit v1.2.3 From 4e34e719e457f2e031297175410fc0bd4016a085 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 Jul 2011 17:37:31 +0200 Subject: fs: take the ACL checks to common code Replace the ->check_acl method with a ->get_acl method that simply reads an ACL from disk after having a cache miss. This means we can replace the ACL checking boilerplate code with a single implementation in namei.c. 
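
As a rough sketch of the new contract (a hypothetical "foofs"; foofs_read_acl_from_disk is an assumed helper, not a real kernel function), a filesystem now only supplies the ACL on a cache miss and lets the common code in fs/namei.c do the posix_acl_permission() check:

#include <linux/fs.h>
#include <linux/posix_acl.h>

struct posix_acl *foofs_read_acl_from_disk(struct inode *inode, int type);	/* assumed helper */

struct posix_acl *foofs_get_acl(struct inode *inode, int type)
{
	struct posix_acl *acl;

	/* read and decode the on-disk xattr; the details are fs specific */
	acl = foofs_read_acl_from_disk(inode, type);
	if (!IS_ERR(acl))
		set_cached_acl(inode, type, acl);	/* NULL caches "no ACL" */
	return acl;		/* NULL means no ACL, ERR_PTR() on error */
}

static const struct inode_operations foofs_file_inode_operations = {
	/* ... other methods ... */
	.get_acl	= foofs_get_acl,	/* replaces the old .check_acl */
};
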
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 4 ++-- Documentation/filesystems/porting | 7 ++++--- Documentation/filesystems/vfs.txt | 2 +- fs/9p/acl.c | 14 +++----------- fs/9p/acl.h | 4 ++-- fs/9p/vfs_inode_dotl.c | 4 ++-- fs/btrfs/acl.c | 18 +----------------- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/inode.c | 10 +++++----- fs/ext2/acl.c | 19 +------------------ fs/ext2/acl.h | 4 ++-- fs/ext2/file.c | 2 +- fs/ext2/namei.c | 4 ++-- fs/ext3/acl.c | 19 +------------------ fs/ext3/acl.h | 4 ++-- fs/ext3/file.c | 2 +- fs/ext3/namei.c | 4 ++-- fs/ext4/acl.c | 19 +------------------ fs/ext4/acl.h | 4 ++-- fs/ext4/file.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/generic_acl.c | 14 -------------- fs/gfs2/acl.c | 25 ++----------------------- fs/gfs2/acl.h | 2 +- fs/gfs2/inode.c | 6 +++--- fs/jffs2/acl.c | 18 +----------------- fs/jffs2/acl.h | 4 ++-- fs/jffs2/dir.c | 2 +- fs/jffs2/file.c | 2 +- fs/jffs2/symlink.c | 2 +- fs/jfs/acl.c | 18 +----------------- fs/jfs/file.c | 2 +- fs/jfs/jfs_acl.h | 2 +- fs/jfs/namei.c | 2 +- fs/namei.c | 24 +++++++++++++----------- fs/ocfs2/acl.c | 24 ++++++------------------ fs/ocfs2/acl.h | 2 +- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/namei.c | 2 +- fs/posix_acl.c | 1 - fs/reiserfs/file.c | 2 +- fs/reiserfs/namei.c | 6 +++--- fs/reiserfs/xattr.c | 18 ------------------ fs/xfs/linux-2.6/xfs_acl.c | 21 ++------------------- fs/xfs/linux-2.6/xfs_iops.c | 8 ++++---- fs/xfs/linux-2.6/xfs_trace.h | 2 +- fs/xfs/xfs_acl.h | 2 -- include/linux/fs.h | 2 +- include/linux/generic_acl.h | 1 - include/linux/reiserfs_acl.h | 6 +----- include/linux/reiserfs_xattr.h | 2 -- mm/shmem.c | 6 ------ 52 files changed, 92 insertions(+), 294 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index ca7e25292542..7e4699146fe1 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -52,7 +52,7 @@ ata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); - int (*check_acl)(struct inode *, int); + int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -80,7 +80,7 @@ put_link: no truncate: yes (see below) setattr: yes permission: no (may not block if called in rcu-walk mode) -check_acl: no +get_acl: no getattr: no setxattr: yes getxattr: no diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 7f8861d341ea..b4a3d765ff9a 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -407,10 +407,11 @@ to some pointer to returning that pointer. On errors return ERR_PTR(...). -- [mandatory] - ->permission(), generic_permission() and ->check_acl() have lost flags + ->permission() and generic_permission()have lost flags argument; instead of passing IPERM_FLAG_RCU we add MAY_NOT_BLOCK into mask. - generic_permission() has also lost the check_acl argument; if you want -non-NULL to be used for that inode, put it into ->i_op->check_acl. + generic_permission() has also lost the check_acl argument; ACL checking +has been taken to VFS and filesystems need to provide a non-NULL ->i_op->get_acl +to read an ACL from disk. 
-- [mandatory] diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index eff6617c9a0f..52d8fb81cfff 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -356,7 +356,7 @@ struct inode_operations { void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); + int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 075bc909da17..814be079c185 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -96,7 +96,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -int v9fs_check_acl(struct inode *inode, int mask) +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) { struct posix_acl *acl; struct v9fs_session_info *v9ses; @@ -108,18 +108,10 @@ int v9fs_check_acl(struct inode *inode, int mask) * On access = client and acl = on mode get the acl * values from the server */ - return -EAGAIN; + return NULL; } - acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + return v9fs_get_cached_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; } static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 3eba10f3af1e..ddb7ae19d971 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,14 +16,14 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask); +extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl **, struct posix_acl **); extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else -#define v9fs_check_acl NULL +#define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 803f59ff2faa..9d808d0e0cd9 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -872,7 +872,7 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { @@ -882,7 +882,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9508ad14c924..65a735d8f6e4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -30,7 +30,7 @@ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; const char *name; @@ -195,22 +195,6 @@ out: return ret; } -int 
btrfs_check_acl(struct inode *inode, int mask) -{ - int error = -EAGAIN; - struct posix_acl *acl; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - - return error; -} - /* * btrfs_init_acl is already generally called under fs_mutex, so the locking * stuff has been fixed to work with that. If the locking stuff changes, we diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82be74efbb26..fe9287b06496 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2645,9 +2645,9 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); +struct posix_acl *btrfs_get_acl(struct inode *inode, int type); #else -#define btrfs_check_acl NULL +#define btrfs_get_acl NULL #endif int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2548a04a0230..e91b097e7252 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7351,12 +7351,12 @@ static const struct inode_operations btrfs_dir_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct file_operations btrfs_dir_file_operations = { @@ -7425,7 +7425,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -7435,7 +7435,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, @@ -7447,7 +7447,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; const struct dentry_operations btrfs_dentry_operations = { diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ba99fa4b2f35..52c053763942 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -128,7 +128,7 @@ fail: /* * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext2_get_acl(struct inode *inode, int type) { int name_index; @@ -231,23 +231,6 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int -ext2_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. 
* diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3ff6cbb9ac44..5c0a6a4fb052 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int); +extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include -#define ext2_check_acl NULL +#define ext2_get_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 82e06321de35..a5b3a5db3120 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,6 +102,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index d60b7099e2db..761fde807fc9 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -408,7 +408,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -419,5 +419,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, }; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index a9fdd77d4b58..6c29bf0df04a 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -131,7 +131,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext3_get_acl(struct inode *inode, int type) { int name_index; @@ -239,23 +239,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -int -ext3_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. 
* diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de9..dbc921e458c5 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); +extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include -#define ext3_check_acl NULL +#define ext3_get_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index f55df0e61cbd..2be5b99097f1 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -79,7 +79,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index c095cf5640c7..3b57230a17bb 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2529,7 +2529,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2540,5 +2540,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, }; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 7b094d1a8d3e..dca2d1ded931 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -131,7 +131,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext4_get_acl(struct inode *inode, int type) { int name_index; @@ -237,23 +237,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -int -ext4_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. 
* diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac4..18cb39ed7c7b 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +struct posix_acl *ext4_get_acl(struct inode *inode, int type); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include -#define ext4_check_acl NULL +#define ext4_get_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ce766f974b1d..e4095e988eba 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -301,7 +301,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 707d605bf769..8c9babac43dc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2590,7 +2590,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; @@ -2602,5 +2602,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, }; diff --git a/fs/generic_acl.c b/fs/generic_acl.c index ea19ca47d452..d5e33a077a67 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -172,20 +172,6 @@ generic_acl_chmod(struct inode *inode) return error; } -int -generic_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; -} - const struct xattr_handler generic_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index a2dd63c0c11a..884c9af0542f 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -67,30 +67,9 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) return acl; } -/** - * gfs2_check_acl - Check an ACL to see if we're allowed to do something - * @inode: the file we want to do something to - * @mask: what we want to do - * - * Returns: errno - */ - -int gfs2_check_acl(struct inode *inode, int mask) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) { - struct posix_acl *acl; - int error; - - acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; + return gfs2_acl_get(GFS2_I(inode), type); } static int gfs2_set_mode(struct inode *inode, mode_t mode) diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b522b0cb39ea..0da38dc7efec 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask); +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct 
iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 0fb51a96eff0..900cf986aadc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1846,7 +1846,7 @@ const struct inode_operations gfs2_file_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1867,7 +1867,7 @@ const struct inode_operations gfs2_dir_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; const struct inode_operations gfs2_symlink_iops = { @@ -1882,6 +1882,6 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 5783ed81171b..27c511a1cf05 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -156,7 +156,7 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) return ERR_PTR(-EINVAL); } -static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +struct posix_acl *jffs2_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *value = NULL; @@ -259,22 +259,6 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int rc; - - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return rc; - } - return -EAGAIN; -} - int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) { struct posix_acl *acl; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9973073b9c47..b3421c78d9f8 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int); +struct posix_acl *jffs2_get_acl(struct inode *inode, int type); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); extern int jffs2_init_acl_post(struct inode *); @@ -36,7 +36,7 @@ extern const struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_check_acl (NULL) +#define jffs2_get_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 5f243cd63afc..9659b7c00468 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -56,7 +56,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 3989f7e09f7f..61e6723535b9 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -63,7 +63,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 
b955626071c2..e3035afb1814 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -20,7 +20,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 687a1ae42e3f..b3a32caf2b45 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -27,7 +27,7 @@ #include "jfs_xattr.h" #include "jfs_acl.h" -static struct posix_acl *jfs_get_acl(struct inode *inode, int type) +struct posix_acl *jfs_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *ea_name; @@ -114,22 +114,6 @@ out: return rc; } -int jfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7527855b5cc6..844f9460cb11 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -140,7 +140,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL - .check_acl = jfs_check_acl, + .get_acl = jfs_get_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 54e07559878d..ad84fe50ca9e 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int); +struct posix_acl *jfs_get_acl(struct inode *inode, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 03787ef6a118..29b1f1a21142 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1537,7 +1537,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL - .check_acl = jfs_check_acl, + .get_acl = jfs_get_acl, #endif }; diff --git a/fs/namei.c b/fs/namei.c index 120efc76d3d0..ec2e5656b444 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -196,20 +196,22 @@ static int check_acl(struct inode *inode, int mask) acl = get_cached_acl(inode, ACL_TYPE_ACCESS); /* - * A filesystem can force a ACL callback by just never - * filling the ACL cache. But normally you'd fill the - * cache either at inode instantiation time, or on the - * first ->check_acl call. + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. * - * If the filesystem doesn't have a check_acl() function - * at all, we'll just create the negative cache entry. + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. 
*/ if (acl == ACL_NOT_CACHED) { - if (inode->i_op->check_acl) - return inode->i_op->check_acl(inode, mask); - - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; + if (inode->i_op->get_acl) { + acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } else { + set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); + return -EAGAIN; + } } if (acl) { diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 480200e94e83..783c58d9daf1 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -290,7 +290,7 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_check_acl(struct inode *inode, int mask) +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; @@ -299,29 +299,17 @@ int ocfs2_check_acl(struct inode *inode, int mask) osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return ret; + return NULL; ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) { - mlog_errno(ret); - return ret; - } + if (ret < 0) + return ERR_PTR(ret); - acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + acl = ocfs2_get_acl_nolock(inode, type, di_bh); brelse(di_bh); - if (IS_ERR(acl)) { - mlog_errno(PTR_ERR(acl)); - return PTR_ERR(acl); - } - if (acl) { - ret = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return ret; - } - - return -EAGAIN; + return acl; } int ocfs2_acl_chmod(struct inode *inode) diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 5c5d31f05853..071fbd380f2f 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,7 +26,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int); +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0fc2bd34039d..de4ea1af041b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2600,14 +2600,14 @@ const struct inode_operations ocfs2_file_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; /* diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 33889dc52dd7..53aa41ed7bf3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2498,5 +2498,5 @@ const struct inode_operations ocfs2_dir_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 0dd0266f9796..a6227d219e93 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -27,7 +27,6 @@ EXPORT_SYMBOL(posix_acl_alloc); EXPORT_SYMBOL(posix_acl_valid); EXPORT_SYMBOL(posix_acl_equiv_mode); EXPORT_SYMBOL(posix_acl_from_mode); -EXPORT_SYMBOL(posix_acl_permission); /* * Init a fresh posix_acl diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index c7156dc39ce7..ace635053a36 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -319,5 +319,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = 
reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 551f1b79dbc4..ef392324bbf1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1529,7 +1529,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; /* @@ -1546,7 +1546,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; @@ -1560,5 +1560,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 7ba083eb62bd..6bc346c160e7 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -867,24 +867,6 @@ out: return err; } -int reiserfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int error = -EAGAIN; /* do regular unix permission checks by default */ - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - if (!IS_ERR(acl)) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } else if (PTR_ERR(acl) != -ENODATA) - error = PTR_ERR(acl); - } - - return error; -} - static int create_privroot(struct dentry *dentry) { int err; diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 2827bbd8366e..44ce51656804 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -114,6 +114,8 @@ xfs_get_acl(struct inode *inode, int type) if (acl != ACL_NOT_CACHED) return acl; + trace_xfs_get_acl(ip); + switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -218,25 +220,6 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int -xfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int error = -EAGAIN; - - trace_xfs_check_acl(XFS_I(inode)); - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - - return error; -} - static int xfs_set_mode(struct inode *inode, mode_t mode) { diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 77463dd55198..6544c3236bc8 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -1022,7 +1022,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1048,7 +1048,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1073,7 +1073,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = 
xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1086,7 +1086,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index fda0708ef2ea..690fc7a7bd72 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -571,7 +571,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL -DEFINE_INODE_EVENT(xfs_check_acl); +DEFINE_INODE_EVENT(xfs_get_acl); #endif DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d7..2c656ef49473 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -42,7 +42,6 @@ struct xfs_acl { #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode); @@ -52,7 +51,6 @@ extern int posix_acl_default_exists(struct inode *inode); extern const struct xattr_handler xfs_xattr_acl_access_handler; extern const struct xattr_handler xfs_xattr_acl_default_handler; #else -# define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL # define xfs_inherit_acl(inode, default_acl) 0 # define xfs_acl_chmod(inode) 0 diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a757a48a5c6..12f84b30c3ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1586,7 +1586,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); + struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 574bea4013b6..b6d657544ef1 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -10,6 +10,5 @@ extern const struct xattr_handler generic_acl_default_handler; int generic_acl_init(struct inode *, struct inode *); int generic_acl_chmod(struct inode *); -int generic_check_acl(struct inode *inode, int mask); #endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 3fd8c4506bbb..f096b80e73d8 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -59,11 +59,7 @@ extern const struct xattr_handler reiserfs_posix_acl_access_handler; #else #define reiserfs_cache_default_acl(inode) 0 - -static inline struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) -{ - return NULL; -} +#define reiserfs_get_acl NULL static inline int reiserfs_acl_chmod(struct inode *inode) { diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 57958c0e1d38..c2b71473266e 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -45,7 +45,6 @@ int reiserfs_permission(struct inode *inode, int mask); #ifdef 
CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -int reiserfs_check_acl(struct inode *inode, int mask); ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); int reiserfs_setxattr(struct dentry *dentry, const char *name, @@ -123,7 +122,6 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #define reiserfs_setxattr NULL #define reiserfs_listxattr NULL #define reiserfs_removexattr NULL -#define reiserfs_check_acl NULL static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb7..3e519798b522 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2715,10 +2715,6 @@ static const struct inode_operations shmem_inode_operations = { .listxattr = shmem_listxattr, .removexattr = shmem_removexattr, #endif -#ifdef CONFIG_TMPFS_POSIX_ACL - .check_acl = generic_check_acl, -#endif - }; static const struct inode_operations shmem_dir_inode_operations = { @@ -2741,7 +2737,6 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, - .check_acl = generic_check_acl, #endif }; @@ -2754,7 +2749,6 @@ static const struct inode_operations shmem_special_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, - .check_acl = generic_check_acl, #endif }; -- cgit v1.2.3 From ee8f248d266ec6966c0ce6b7dec24de43dcc1b58 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Mon, 25 Jul 2011 17:11:50 -0700 Subject: hugetlb: add phys addr to struct huge_bootmem_page This is needed on HIGHMEM systems - we don't always have a virtual address so store the physical address and map it in as needed. [akpm@linux-foundation.org: cleanup] Signed-off-by: Becky Bruce Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 59225ef27d15..19644e0016bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -231,6 +231,9 @@ struct hstate { struct huge_bootmem_page { struct list_head list; struct hstate *hstate; +#ifdef CONFIG_HIGHMEM + phys_addr_t phys; +#endif }; struct page *alloc_huge_page_node(struct hstate *h, int nid); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153bc829..c6d342d313c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { - struct page *page = virt_to_page(m); struct hstate *h = m->hstate; + struct page *page; + +#ifdef CONFIG_HIGHMEM + page = pfn_to_page(m->phys >> PAGE_SHIFT); + free_bootmem_late((unsigned long)m, + sizeof(struct huge_bootmem_page)); +#else + page = virt_to_page(m); +#endif __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); -- cgit v1.2.3 From 33dd4e0ec91138c3d80e790c08a3db47426c81f2 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 25 Jul 2011 17:11:51 -0700 Subject: mm: make some struct page's const These uses are read-only and in a subsequent patch I have a const struct page in my hand... 
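For illustration only (not part of this patch; the helper name is made up), the point of the constification is that a read-only helper can now itself take a const page:

	static inline int page_on_node(const struct page *page, int nid)
	{
		/* legal only once page_to_nid() accepts a const struct page * */
		return page_to_nid(page) == nid;
	}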
[akpm@linux-foundation.org: fix warnings in lowmem_page_address()] Signed-off-by: Ian Campbell Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++------ mm/sparse.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a45ad22a170..6321e840e21d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -637,7 +637,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) -static inline enum zone_type page_zonenum(struct page *page) +static inline enum zone_type page_zonenum(const struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } @@ -665,15 +665,15 @@ static inline int zone_to_nid(struct zone *zone) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(struct page *page); +extern int page_to_nid(const struct page *page); #else -static inline int page_to_nid(struct page *page) +static inline int page_to_nid(const struct page *page) { return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif -static inline struct zone *page_zone(struct page *page) +static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } @@ -718,9 +718,9 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ #include -static __always_inline void *lowmem_page_address(struct page *page) +static __always_inline void *lowmem_page_address(const struct page *page) { - return __va(PFN_PHYS(page_to_pfn(page))); + return __va(PFN_PHYS(page_to_pfn((struct page *)page))); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12831a2..858e1dff9b2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(struct page *page) +int page_to_nid(const struct page *page) { return section_to_node_table[page_to_section(page)]; } -- cgit v1.2.3 From ccb6108f5b0b541d3eb332c3a73e645c0f84278e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jul 2011 17:11:57 -0700 Subject: mm/backing-dev.c: reset bdi min_ratio in bdi_unregister() Vito said: : The system has many usb disks coming and going day to day, with their : respective bdi's having min_ratio set to 1 when inserted. It works for : some time until eventually min_ratio can no longer be set, even when the : active set of bdi's seen in /sys/class/bdi/*/min_ratio doesn't add up to : anywhere near 100. : : This then leads to an unrelated starvation problem caused by write-heavy : fuse mounts being used atop the usb disks, a problem the min_ratio setting : at the underlying devices bdi effectively prevents. Fix this leakage by resetting the bdi min_ratio when unregistering the BDI. 
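The sequence that leaked, schematically (bdi_set_min_ratio() is the real API, the surrounding steps are illustrative):

	bdi_set_min_ratio(bdi, 1);	/* written via /sys/class/bdi/<dev>/min_ratio */
	/* ... USB disk goes away ... */
	bdi_unregister(bdi);		/* used to leave that 1% accounted in the global
					 * sum forever; now it is reset to 0 on unregister */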
Signed-off-by: Peter Zijlstra Reported-by: Vito Caputo Cc: Wu Fengguang Cc: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09a..e56fe35cef05 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -606,6 +606,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); del_timer_sync(&bdi->wb.wakeup_timer); -- cgit v1.2.3 From 9d0ad8ca43ce8023bb834a409c2258bd7197fb05 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 25 Jul 2011 17:12:05 -0700 Subject: mm: extend memory hotplug API to allow memory hotplug in virtual machines This patch contains online_page_callback and apropriate functions for registering/unregistering online page callbacks. It allows to do some machine specific tasks during online page stage which is required to implement memory hotplug in virtual machines. Currently this patch is required by latest memory hotplug support for Xen balloon driver patch which will be posted soon. Additionally, originial online_page() function was splited into following functions doing "atomic" operations: - __online_page_set_limits() - set new limits for memory management code, - __online_page_increment_counters() - increment totalram_pages and totalhigh_pages, - __online_page_free() - free page to allocator. It was done to: - not duplicate existing code, - ease hotplug code devolpment by usage of well defined interface, - avoid stupid bugs which are unavoidable when the same code (by design) is developed in many places. [akpm@linux-foundation.org: use explicit indirect-call syntax] Signed-off-by: Daniel Kiper Reviewed-by: Konrad Rzeszutek Wilk Cc: Ian Campbell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 11 +++++-- mm/memory_hotplug.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8122018d3000..0b8e2a742600 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -68,12 +68,19 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -/* need some defines for these for archs that don't support it */ -extern void online_page(struct page *page); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long, unsigned long); extern void __offline_isolated_pages(unsigned long, unsigned long); +typedef void (*online_page_callback_t)(struct page *page); + +extern int set_online_page_callback(online_page_callback_t callback); +extern int restore_online_page_callback(online_page_callback_t callback); + +extern void __online_page_set_limits(struct page *page); +extern void __online_page_increment_counters(struct page *page); +extern void __online_page_free(struct page *page); + #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 
c46887b5a11e..6e7d8b21dbfa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,17 @@ #include "internal.h" +/* + * online_page_callback contains pointer to current page onlining function. + * Initially it is generic_online_page(). If it is required it could be + * changed by calling set_online_page_callback() for callback registration + * and restore_online_page_callback() for generic callback restore. + */ + +static void generic_online_page(struct page *page); + +static online_page_callback_t online_page_callback = generic_online_page; + DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) @@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__remove_pages); -void online_page(struct page *page) +int set_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == generic_online_page) { + online_page_callback = callback; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(set_online_page_callback); + +int restore_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == callback) { + online_page_callback = generic_online_page; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(restore_online_page_callback); + +void __online_page_set_limits(struct page *page) { unsigned long pfn = page_to_pfn(page); - totalram_pages++; if (pfn >= num_physpages) num_physpages = pfn + 1; +} +EXPORT_SYMBOL_GPL(__online_page_set_limits); + +void __online_page_increment_counters(struct page *page) +{ + totalram_pages++; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) totalhigh_pages++; #endif +} +EXPORT_SYMBOL_GPL(__online_page_increment_counters); +void __online_page_free(struct page *page) +{ ClearPageReserved(page); init_page_count(page); __free_page(page); } +EXPORT_SYMBOL_GPL(__online_page_free); + +static void generic_online_page(struct page *page) +{ + __online_page_set_limits(page); + __online_page_increment_counters(page); + __online_page_free(page); +} static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) @@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); - online_page(page); + (*online_page_callback)(page); onlined_pages++; } *(unsigned long *)arg = onlined_pages; -- cgit v1.2.3 From e21c7ffd6f7493aa01bccd17ebc13dbdfecce880 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:06 -0700 Subject: mm: swap-token: fix dead link http://www.cs.wm.edu/~sjiang/token.pdf is now dead. Replace it with an alive alternative. Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/thrash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/thrash.c b/mm/thrash.c index fabf2d0f5169..d416d403e140 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -6,7 +6,7 @@ * Released under the GPL, see the file COPYING for details. 
* * Simple token based thrashing protection, using the algorithm - * described in: http://www.cs.wm.edu/~sjiang/token.pdf + * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html * * Sep 2006, Ashwin Chaugule * Improved algorithm to pass token: -- cgit v1.2.3 From 53bb01f593d50188c8d638f89db96f9b6b042bcd Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:07 -0700 Subject: mm: swap-token: makes global variables to function local global_faults and last_aging are only used in grab_swap_token(). Move them into grab_swap_token(). Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/thrash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/thrash.c b/mm/thrash.c index d416d403e140..42ffb0179271 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -30,8 +30,6 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; struct mem_cgroup *swap_token_memcg; -static unsigned int global_faults; -static unsigned int last_aging; #ifdef CONFIG_CGROUP_MEM_RES_CTLR static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) @@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm) { int current_interval; unsigned int old_prio = mm->token_priority; + static unsigned int global_faults; + static unsigned int last_aging; global_faults++; -- cgit v1.2.3 From 45ebb840257b060ec54416aebffd9747e210962c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:08 -0700 Subject: mm: swap-token: add a comment for priority aging Document some swap token aging design decisions. Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/thrash.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/thrash.c b/mm/thrash.c index 42ffb0179271..e53f7d02c17c 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm) if (!swap_token_mm) goto replace_token; + /* + * Usually, we don't need priority aging because long interval faults + * makes priority decrease quickly. But there is one exception. If the + * token owner task is sleeping, it never make long interval faults. + * Thus, we need a priority aging mechanism instead. The requirements + * of priority aging are + * 1) An aging interval is reasonable enough long. Too short aging + * interval makes quick swap token lost and decrease performance. + * 2) The swap token owner task have to get priority aging even if + * it's under sleep. + */ if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { swap_token_mm->token_priority /= 2; last_aging = global_faults; -- cgit v1.2.3 From 4b6ddbf7ed4ef2f40e0a27418146eedaa68953c6 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:09 -0700 Subject: pagewalk: fix walk_page_range() don't check find_vma() result properly The doc of find_vma() says, /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { (snip) Thus, caller should confirm whether the returned vma matches a desired one. 
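The pattern callers should use, and which the hunk below adopts, is:

	vma = find_vma(walk->mm, addr);
	if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) {
		/* addr really falls inside a hugetlb vma */
	}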
Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d533611..606bbb4125d0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -176,7 +176,7 @@ int walk_page_range(unsigned long addr, unsigned long end, * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) { + if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; /* -- cgit v1.2.3 From 6c6d5280431544e4036886ea74e3334a98bc5f96 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:09 -0700 Subject: pagewalk: don't look up vma if walk->hugetlb_entry is unused Currently, walk_page_range() calls find_vma() every page table for walk iteration. but it's completely unnecessary if walk->hugetlb_entry is unused. And we don't have to assume find_vma() is a lightweight operation. So this patch checks the walk->hugetlb_entry and avoids the find_vma() call if possible. This patch also makes some cleanups. 1) remove ugly uninitialized_var() and 2) #ifdef in function body. Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 606bbb4125d0..ee4ff87c58c1 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -#endif + +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + struct vm_area_struct *vma; + + /* We don't need vma lookup at all. */ + if (!walk->hugetlb_entry) + return NULL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + vma = find_vma(walk->mm, addr); + if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) + return vma; + + return NULL; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + return NULL; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + + /** * walk_page_range - walk a memory map's page tables with a callback @@ -165,18 +197,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *uninitialized_var(vma); + struct vm_area_struct *vma; next = pgd_addr_end(addr, end); -#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. 
*/ - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) { + vma = hugetlb_vma(addr, walk); + if (vma) { if (vma->vm_end < next) next = vma->vm_end; /* @@ -189,7 +220,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, next); continue; } -#endif + if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); -- cgit v1.2.3 From c27fe4c8942d3ca715986f79cc26f44608d7d9fb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:10 -0700 Subject: pagewalk: add locking-rule comments Originally, walk_hugetlb_range() didn't require a caller take any lock. But commit d33b9f45bd ("mm: hugetlb: fix hugepage memory leak in walk_page_range") changed its rule. Because it added find_vma() call in walk_hugetlb_range(). Any locking-rule change commit should write a doc too. [akpm@linux-foundation.org: clarify comment] Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/pagewalk.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6321e840e21d..3c5505a3bc49 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -911,6 +911,8 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, * @pte_entry: if set, called for each non-empty PTE (4th-level) entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry + * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry + * is used. * * (see walk_page_range for more details) */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index ee4ff87c58c1..f7929406e776 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -181,6 +181,9 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, * * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. + * + * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry + * is !NULL. */ int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk) -- cgit v1.2.3 From dd78553b5e7a0b34c0b60478d04ee16d8d8f4fa7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:11 -0700 Subject: pagewalk: fix code comment for THP Commit bae9c19bf1 ("thp: split_huge_page_mm/vma") changed locking behavior of walk_page_range(). Thus this patch changes the comment too. Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index f7929406e776..2f5cf10ff660 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -176,7 +176,8 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, * associated range, and a copy of the original mm_walk for access to * the ->private or ->mm fields. * - * No locks are taken, but the bottom level iterator will map PTE + * Usually no locks are taken, but splitting transparent huge page may + * take page table lock. And the bottom level iterator will map PTE * directories from highmem if necessary. 
* * If any callback returns a non-zero value, the walk is aborted and -- cgit v1.2.3 From 00a66d2974485d7d95d61d5772142b2a2231ed2a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Jul 2011 17:12:12 -0700 Subject: mm: remove the leftovers of noswapaccount In commit a2c8990aed5ab ("memsw: remove noswapaccount kernel parameter"), Michal forgot to remove some left pieces of noswapaccount in the tree, this patch removes them all. Signed-off-by: WANG Cong Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 16 ---------------- init/Kconfig | 4 ++-- mm/page_cgroup.c | 2 +- 3 files changed, 3 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index d59e71df5c5c..f9d240dfac06 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -518,22 +518,6 @@ Files: net/netfilter/xt_connlimit.c ---------------------------- -What: noswapaccount kernel command line parameter -When: 2.6.40 -Why: The original implementation of memsw feature enabled by - CONFIG_CGROUP_MEM_RES_CTLR_SWAP could be disabled by the noswapaccount - kernel parameter (introduced in 2.6.29-rc1). Later on, this decision - turned out to be not ideal because we cannot have the feature compiled - in and disabled by default and let only interested to enable it - (e.g. general distribution kernels might need it). Therefore we have - added swapaccount[=0|1] parameter (introduced in 2.6.37) which provides - the both possibilities. If we remove noswapaccount we will have - less command line parameters with the same functionality and we - can also cleanup the parameter handling a bit (). -Who: Michal Hocko - ----------------------------- - What: ipt_addrtype match include file When: 2012 Why: superseded by xt_addrtype diff --git a/init/Kconfig b/init/Kconfig index e20aa3112240..d62778390e55 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -673,7 +673,7 @@ config CGROUP_MEM_RES_CTLR_SWAP be careful about enabling this. When memory resource controller is disabled by boot option, this will be automatically disabled and there will be no overhead from this. Even when you set this config=y, - if boot option "noswapaccount" is set, swap will not be accounted. + if boot option "swapaccount=0" is set, swap will not be accounted. Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. config CGROUP_MEM_RES_CTLR_SWAP_ENABLED @@ -688,7 +688,7 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED parameter should have this option unselected. For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it - then noswapaccount does the trick). + then swapaccount=0 does the trick). 
config CGROUP_PERF bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 53bffc6c293e..9cb1c44ffc37 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) nomem: printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); printk(KERN_INFO - "swap_cgroup can be disabled by noswapaccount boot option\n"); + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } -- cgit v1.2.3 From 1bb36fbd4d58ec3fab4dab5ed39a2af492c263ea Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 25 Jul 2011 17:12:13 -0700 Subject: mm/page_cgroup.c: simplify code by using SECTION_ALIGN_UP() and SECTION_ALIGN_DOWN() macros Commit a539f3533b78e3 ("mm: add SECTION_ALIGN_UP() and SECTION_ALIGN_DOWN() macro") introduced the SECTION_ALIGN_UP() and SECTION_ALIGN_DOWN() macros. Use those macros to increase code readability. Signed-off-by: Daniel Kiper Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 9cb1c44ffc37..39d216d535ea 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == -1) { /* @@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_cgroup(pfn); -- cgit v1.2.3 From d788e80a8c83ecdbdd55b6e985cced9cfe3a815b Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 25 Jul 2011 17:12:14 -0700 Subject: mm/huge_memory.c: minor lock simplification in __khugepaged_exit The lock is released first thing in all three branches. Simplify this by unconditionally releasing lock and remove else clause which was only there to be sure lock was released. 
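Schematically (shape only, not the literal code), the function now reads:

	spin_lock(&khugepaged_mm_lock);
	/* ... look up mm_slot, maybe list_del() it, set free ... */
	spin_unlock(&khugepaged_mm_lock);	/* one unlock for every branch */

	if (free) {
		/* ... drop the slot ... */
	} else if (mm_slot) {
		/* ... serialize against khugepaged_test_exit() ... */
	}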
Signed-off-by: Chris Wright Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81532f297fd2..e2d1587be269 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm) list_del(&mm_slot->mm_node); free = 1; } + spin_unlock(&khugepaged_mm_lock); if (free) { - spin_unlock(&khugepaged_mm_lock); clear_bit(MMF_VM_HUGEPAGE, &mm->flags); free_mm_slot(mm_slot); mmdrop(mm); } else if (mm_slot) { - spin_unlock(&khugepaged_mm_lock); /* * This is required to serialize against * khugepaged_test_exit() (which is guaranteed to run @@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm) */ down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); - } else - spin_unlock(&khugepaged_mm_lock); + } } static void release_pte_page(struct page *page) -- cgit v1.2.3 From 32f84528fbb5177275193a3311be8756f0cbd62c Mon Sep 17 00:00:00 2001 From: Chris Forbes Date: Mon, 25 Jul 2011 17:12:14 -0700 Subject: mm: hugetlb: fix coding style issues Fix coding style issues flagged by checkpatch.pl Signed-off-by: Chris Forbes Acked-by: Eric B Munson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c6d342d313c7..dae27ba3be2c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include @@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); * must either hold the mmap_sem for write, or the mmap_sem for read and * the hugetlb_instantiation mutex: * - * down_write(&mm->mmap_sem); + * down_write(&mm->mmap_sem); * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1 << PG_writeback); } set_compound_page_dtor(page, NULL); set_page_refcounted(page); @@ -591,7 +592,6 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } - EXPORT_SYMBOL_GPL(PageHuge); static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) @@ -2132,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); - } } @@ -2189,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) { + if (non_swap_entry(swp) && is_migration_entry(swp)) return 1; - } else + else return 
0; } @@ -2202,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) return 1; - } else + else return 0; } @@ -2567,7 +2566,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON | VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } @@ -2635,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, (pmd_t *)ptep, address); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | + return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(h - hstates); } -- cgit v1.2.3 From 6ac47520063b230641a64062b8a229201cd0a3a8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Jul 2011 17:12:16 -0700 Subject: mm/memory.c: remove ZAP_BLOCK_SIZE ZAP_BLOCK_SIZE became unused in the preemptible-mmu_gather work ("mm: Remove i_mmap_lock lockbreak"). So zap it. Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9b8a01d941cb..a58bbebb3070 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, return addr; } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * * Unmap all pages in the vma list. * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. -- cgit v1.2.3 From 11239836c04b50ba8453ec58ca7a7bd716ef02c1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 25 Jul 2011 17:12:17 -0700 Subject: oom: remove references to old badness() function The badness() function in the oom killer was renamed to oom_badness() in a63d83f427fb ("oom: badness heuristic rewrite") since it is a globally exported function for clarity. The prototype for the old function still existed in linux/oom.h, so remove it. There are no existing users. Also fixes documentation and comment references to badness() and adjusts them accordingly. 
Signed-off-by: David Rientjes Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/obsolete/proc-pid-oom_adj | 2 +- Documentation/feature-removal-schedule.txt | 2 +- include/linux/oom.h | 4 ---- mm/oom_kill.c | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj index cf63f264ce0f..9a3cb88ade47 100644 --- a/Documentation/ABI/obsolete/proc-pid-oom_adj +++ b/Documentation/ABI/obsolete/proc-pid-oom_adj @@ -14,7 +14,7 @@ Why: /proc//oom_adj allows userspace to influence the oom killer's A much more powerful interface, /proc//oom_score_adj, was introduced with the oom killer rewrite that allows users to increase or - decrease the badness() score linearly. This interface will replace + decrease the badness score linearly. This interface will replace /proc//oom_adj. A warning will be emitted to the kernel log if an application uses this diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f9d240dfac06..d093e550dbeb 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -184,7 +184,7 @@ Why: /proc//oom_adj allows userspace to influence the oom killer's A much more powerful interface, /proc//oom_score_adj, was introduced with the oom killer rewrite that allows users to increase or - decrease the badness() score linearly. This interface will replace + decrease the badness score linearly. This interface will replace /proc//oom_adj. A warning will be emitted to the kernel log if an application uses this diff --git a/include/linux/oom.h b/include/linux/oom.h index 4952fb874ad3..13b7b02e599a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -64,10 +64,6 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } -/* The badness from the OOM killer */ -extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, - const nodemask_t *nodemask, unsigned long uptime); - extern struct task_struct *find_lock_task_mm(struct task_struct *p); /* sysctls */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0be989d4365..eafff89b3dd6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -487,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If any of p's children has a different mm and is eligible for kill, - * the one with the highest badness() score is sacrificed for its + * the one with the highest oom_badness() score is sacrificed for its * parent. This attempts to lose the minimal amount of work done while * still freeing memory. */ -- cgit v1.2.3 From c9d8c3d0896bfa5b57531ecc41a85ffbc6d87dbe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Jul 2011 17:12:18 -0700 Subject: mm/memblock.c: avoid abuse of RED_INACTIVE RED_INACTIVE is a slab thing, and reusing it for memblock was inappropriate, because memblock is dealing with phys_addr_t's which have a Kconfigurable sizeof(). Create a new poison type for this application. 
Fixes the sparse warning warning: cast truncates bits from constant value (9f911029d74e35b becomes 9d74e35b) Reported-by: H Hartley Sweeten Tested-by: H Hartley Sweeten Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 6 ++++++ mm/memblock.c | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/poison.h b/include/linux/poison.h index 2110a81c5e2a..79159de0e341 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -40,6 +40,12 @@ #define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ #define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT +#define MEMBLOCK_INACTIVE 0x3a84fb0144c9e71bULL +#else +#define MEMBLOCK_INACTIVE 0x44c9e71bUL +#endif + #define SLUB_RED_INACTIVE 0xbb #define SLUB_RED_ACTIVE 0xcc diff --git a/mm/memblock.c b/mm/memblock.c index a0562d1a6ad4..ccbf97339592 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -758,9 +758,9 @@ void __init memblock_analyze(void) /* Check marker in the unused last array entry */ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); memblock.memory_size = 0; @@ -786,8 +786,8 @@ void __init memblock_init(void) memblock.reserved.max = INIT_MEMBLOCK_REGIONS; /* Write a marker in the unused last array entry */ - memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; - memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; /* Create a dummy zero size MEMBLOCK which will get coalesced away later. * This simplifies the memblock_add() code below... -- cgit v1.2.3 From c15bef3099c346f2124367bff46954b59e13c3ee Mon Sep 17 00:00:00 2001 From: Dmitry Fink Date: Mon, 25 Jul 2011 17:12:19 -0700 Subject: mmap: fix and tidy up overcommit page arithmetic - shmem pages are not immediately available, but they are not potentially available either, even if we swap them out, they will just relocate from memory into swap, total amount of immediate and potentially available memory is not going to be affected, so we shouldn't count them as potentially free in the first place. - nr_free_pages() is not an expensive operation anymore, there is no need to split the decision making in two halves and repeat code. Signed-off-by: Dmitry Fink Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 34 +++++++++++++--------------------- mm/nommu.c | 34 +++++++++++++--------------------- 2 files changed, 26 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d49736ff8a8d..a65efd4db3e1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. 
+ */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -135,35 +143,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free += global_page_state(NR_SLAB_RECLAIMABLE); - /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; diff --git a/mm/nommu.c b/mm/nommu.c index 5c5c2d4b1807..4358032566e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1884,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -1897,35 +1905,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free += global_page_state(NR_SLAB_RECLAIMABLE); - /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; -- cgit v1.2.3 From 5e5358e7cf48aa079b8761a7d806ad536023745c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:23 -0700 Subject: mm: cleanup descriptions of filler arg The often-NULL data arg to read_cache_page() and read_mapping_page() functions is misdescribed as "destination for read data": no, it's the first arg to the filler function, often struct file * to ->readpage(). Satisfy checkpatch.pl on those filler prototypes, and tidy up the declarations in linux/pagemap.h. 
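A typical call site (illustrative) shows why the old wording was misleading:

	page = read_mapping_page(mapping, index, NULL);
	if (IS_ERR(page))		/* NULL is simply forwarded to ->readpage(); */
		return PTR_ERR(page);	/* nothing is ever "read into" it */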
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 +++++------- mm/filemap.c | 12 ++++++------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8e38d4c140ff..cfaaa6949b8b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -255,26 +255,24 @@ static inline struct page *grab_cache_page(struct address_space *mapping, extern struct page * grab_cache_page_nowait(struct address_space *mapping, pgoff_t index); extern struct page * read_cache_page_async(struct address_space *mapping, - pgoff_t index, filler_t *filler, - void *data); + pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, - void *data); + pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); static inline struct page *read_mapping_page_async( - struct address_space *mapping, - pgoff_t index, void *data) + struct address_space *mapping, + pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page_async(mapping, index, filler, data); } static inline struct page *read_mapping_page(struct address_space *mapping, - pgoff_t index, void *data) + pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page(mapping, index, filler, data); diff --git a/mm/filemap.c b/mm/filemap.c index f820e600f1ad..2780be4bd493 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1792,7 +1792,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) { @@ -1823,7 +1823,7 @@ repeat: static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) @@ -1863,7 +1863,7 @@ out: * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. @@ -1875,7 +1875,7 @@ out: */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -1923,7 +1923,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Read into the page cache. If a page already exists, and PageUptodate() is * not set, try to fill the page then wait for it to become unlocked. 
@@ -1932,7 +1932,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -- cgit v1.2.3 From 8a549bea51138be2126a2cc6aabe8f17ef66b79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:24 -0700 Subject: mm: tidy vmtruncate_range and related functions Use consistent variable names in truncate_pagecache(), truncate_setsize(), vmtruncate() and vmtruncate_range(). unmap_mapping_range() and vmtruncate_range() have mismatched interfaces: don't change either, but make the vmtruncates more precise about what they expect unmap_mapping_range() to do. vmtruncate_range() is currently called only with page-aligned start and end+1: can handle unaligned start, but unaligned end+1 would hit BUG_ON in truncate_inode_pages_range() (lacks partial clearing of the end page). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 003c6c685fc8..c924764e2ce5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -531,8 +531,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @old: old file offset - * @new: new file offset + * @oldsize: old file size + * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. @@ -544,9 +544,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ -void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for @@ -557,9 +558,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + unmap_mapping_range(mapping, holebegin, 0, 1); + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -589,29 +590,31 @@ EXPORT_SYMBOL(truncate_setsize); /** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used - * @offset: file offset to start truncating + * @newsize: file offset to start truncating * * This function is deprecated and truncate_setsize or truncate_pagecache * should be used instead, together with filesystem specific block truncation. 
*/ -int vmtruncate(struct inode *inode, loff_t offset) +int vmtruncate(struct inode *inode, loff_t newsize) { int error; - error = inode_newsize_ok(inode, offset); + error = inode_newsize_ok(inode, newsize); if (error) return error; - truncate_setsize(inode, offset); + truncate_setsize(inode, newsize); if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(lstart, PAGE_SIZE); + loff_t holelen = 1 + lend - holebegin; /* * If the underlying filesystem is not going to provide @@ -623,10 +626,10 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) mutex_lock(&inode->i_mutex); inode_dio_wait(inode); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); + unmap_mapping_range(mapping, holebegin, holelen, 1); + inode->i_op->truncate_range(inode, lstart, lend); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, offset, (end - offset), 1); + unmap_mapping_range(mapping, holebegin, holelen, 1); mutex_unlock(&inode->i_mutex); return 0; -- cgit v1.2.3 From b85e0effd3dcbf9118b896232f59526ab1a39a74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: consistent truncate and invalidate loops Make the pagevec_lookup loops in truncate_inode_pages_range(), invalidate_mapping_pages() and invalidate_inode_pages2_range() more consistent with each other. They were relying upon page->index of an unlocked page, but apologizing for it: accept it, embrace it, add comments and WARN_ONs, and simplify the index handling. invalidate_inode_pages2_range() had special handling for a wrapped page->index + 1 = 0 case; but MAX_LFS_FILESIZE doesn't let us anywhere near there, and a corrupt page->index in the radix_tree could cause more trouble than that would catch. Remove that wrapped handling. invalidate_inode_pages2_range() uses min() to limit the pagevec_lookup when near the end of the range: copy that into the other two, although it's less useful than you might think (it limits the use of the buffer, rather than the indices looked up). 
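As a hedged sketch, not code from the patch itself, this is the common loop shape the three functions converge on after this change; the per-page work is elided since it differs between truncate and the two invalidates, and the mem_cgroup batching calls are dropped for brevity.

static void example_pagevec_walk(struct address_space *mapping,
                                 pgoff_t start, pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t index = start;
        int i;

        pagevec_init(&pvec, 0);
        while (index <= end && pagevec_lookup(&pvec, mapping, index,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* We rely upon deletion not changing page->index */
                        index = page->index;
                        if (index > end)
                                break;
                        if (!trylock_page(page))
                                continue;
                        WARN_ON(page->index != index);
                        /* ... truncate or invalidate the locked page ... */
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                cond_resched();
                index++;
        }
}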
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 ++ mm/truncate.c | 110 +++++++++++++++++++++++++--------------------------------- 2 files changed, 49 insertions(+), 63 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2780be4bd493..10a171113273 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -128,6 +128,7 @@ void __delete_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) @@ -483,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); diff --git a/mm/truncate.c b/mm/truncate.c index c924764e2ce5..dc459014f777 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) * The first pass will remove most pages, so the search cost of the second pass * is low. * - * When looking at page->index outside the page lock we need to be careful to - * copy it into a local to avoid races (it could change at any time). - * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. @@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; - pgoff_t next; + pgoff_t index; + pgoff_t end; int i; cleancache_flush_inode(mapping); @@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, end = (lend >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); - next = start; - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - if (page_index > end) { - next = page_index; + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; - } - if (page_index > next) - next = page_index; - next++; if (!trylock_page(page)) continue; + WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } if (partial) { @@ -264,13 +259,14 @@ void truncate_inode_pages_range(struct address_space *mapping, } } - next = start; + index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) break; - next = start; + index = start; continue; } if (pvec.pages[0]->index > end) { @@ -281,18 +277,20 @@ void 
truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - if (page->index > end) + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; + lock_page(page); + WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); - if (page->index > next) - next = page->index; - next++; unlock_page(page); } pagevec_release(&pvec); mem_cgroup_uncharge_end(); + index++; } cleancache_flush_inode(mapping); } @@ -333,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next = start; + pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t index; - int lock_failed; - - lock_failed = !trylock_page(page); - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ + /* We rely upon deletion not changing page->index */ index = page->index; - if (index > next) - next = index; - next++; - if (lock_failed) - continue; + if (index > end) + break; + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* @@ -371,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!ret) deactivate_page(page); count += ret; - if (next > end) - break; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } return count; } @@ -442,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next; + pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; - int wrapped = 0; cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); - next = start; - while (next <= end && !wrapped && - pagevec_lookup(&pvec, mapping, next, - min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; lock_page(page); + WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } - page_index = page->index; - next = page_index + 1; - if (next == 0) - wrapped = 1; - if (page_index > end) { - unlock_page(page); - break; - } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { @@ -480,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - (loff_t)page_index< Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: pincer in truncate_inode_pages_range truncate_inode_pages_range()'s final loop has a nice pincer property, bringing start and end together, squeezing out the last pages. 
But the range handling missed out on that, just sliding up the range, perhaps letting pages come in behind it. Add one more test to give it the same pincer effect. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index dc459014f777..232eb2736a79 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -269,7 +269,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; continue; } - if (pvec.pages[0]->index > end) { + if (index == start && pvec.pages[0]->index > end) { pagevec_release(&pvec); break; } -- cgit v1.2.3 From d515afe88a32e567c550e3db914f3e378f86453a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:26 -0700 Subject: tmpfs: no need to use i_lock 2.6.36's 7e496299d4d2 ("tmpfs: make tmpfs scalable with percpu_counter for used blocks") to make tmpfs scalable with percpu_counter used inode->i_lock in place of sbinfo->stat_lock around i_blocks updates; but that was adverse to scalability, and unnecessary, since info->lock is already held there in the fast paths. Remove those uses of i_lock, and add info->lock in the three error paths where it's then needed across shmem_free_blocks(). It's not actually needed across shmem_unacct_blocks(), but they're so often paired that it looks wrong to split them apart. Signed-off-by: Hugh Dickins Acked-by: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb7..c1db11cf220d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -241,9 +241,7 @@ static void shmem_free_blocks(struct inode *inode, long pages) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { percpu_counter_add(&sbinfo->used_blocks, -pages); - spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } } @@ -432,9 +430,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long sbinfo->max_blocks - 1) >= 0) return ERR_PTR(-ENOSPC); percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); @@ -1421,9 +1417,7 @@ repeat: shmem_acct_block(info->flags)) goto nospace; percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) goto nospace; @@ -1434,8 +1428,10 @@ repeat: spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { + spin_lock(&info->lock); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); error = -ENOMEM; goto failed; } @@ -1449,8 +1445,10 @@ repeat: current->mm, GFP_KERNEL); if (error) { page_cache_release(filepage); + spin_lock(&info->lock); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); filepage = NULL; goto failed; } @@ -1480,10 +1478,10 @@ repeat: * be done automatically. 
*/ if (ret) { - spin_unlock(&info->lock); - page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); + page_cache_release(filepage); filepage = NULL; if (error) goto failed; -- cgit v1.2.3 From 1d65f86db14806cf7b1218c7b4ecb8b4db5af27d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 25 Jul 2011 17:12:27 -0700 Subject: mm: preallocate page before lock_page() at filemap COW Currently we are keeping faulted page locked throughout whole __do_fault call (except for page_mkwrite code path) after calling file system's fault code. If we do early COW, we allocate a new page which has to be charged for a memcg (mem_cgroup_newpage_charge). This function, however, might block for unbounded amount of time if memcg oom killer is disabled or fork-bomb is running because the only way out of the OOM situation is either an external event or OOM-situation fix. In the end we are keeping the faulted page locked and blocking other processes from faulting it in which is not good at all because we are basically punishing potentially an unrelated process for OOM condition in a different group (I have seen stuck system because of ld-2.11.1.so being locked). We can do test easily. % cgcreate -g memory:A % cgset -r memory.limit_in_bytes=64M A % cgset -r memory.memsw.limit_in_bytes=64M A % cd kernel_dir; cgexec -g memory:A make -j Then, the whole system will live-locked until you kill 'make -j' by hands (or push reboot...) This is because some important page in a a shared library are locked. Considering again, the new page is not necessary to be allocated with lock_page() held. And usual page allocation may dive into long memory reclaim loop with holding lock_page() and can cause very long latency. There are 3 ways. 1. do allocation/charge before lock_page() Pros. - simple and can handle page allocation in the same manner. This will reduce holding time of lock_page() in general. Cons. - we do page allocation even if ->fault() returns error. 2. do charge after unlock_page(). Even if charge fails, it's just OOM. Pros. - no impact to non-memcg path. Cons. - implemenation requires special cares of LRU and we need to modify page_add_new_anon_rmap()... 3. do unlock->charge->lock again method. Pros. - no impact to non-memcg path. Cons. - This may kill LOCK_PAGE_RETRY optimization. We need to release lock and get it again... This patch moves "charge" and memory allocation for COW page before lock_page(). Then, we can avoid scanning LRU with holding a lock on a page and latency under lock_page() will be reduced. Then, above livelock disappears. 
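A condensed, hedged sketch of approach 1, which is what the patch implements; the copy and map steps are elided, and the split into a separate "prepare" helper with the name example_cow_fault_prepare exists only for this sketch. See the real diff below for the full change.

static int example_cow_fault_prepare(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long address, unsigned int flags,
                                     struct vm_fault *vmf,
                                     struct page **cow_pagep)
{
        struct page *cow_page = NULL;
        int ret;

        /* Allocate and charge the COW page before any page lock is taken */
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                if (unlikely(anon_vma_prepare(vma)))
                        return VM_FAULT_OOM;
                cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
                if (!cow_page)
                        return VM_FAULT_OOM;
                if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
                        page_cache_release(cow_page);
                        return VM_FAULT_OOM;
                }
        }

        ret = vma->vm_ops->fault(vma, vmf);     /* may sleep, may lock vmf->page */
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) {
                /* ->fault() failed: give back what was charged up front */
                if (cow_page) {
                        mem_cgroup_uncharge_page(cow_page);
                        page_cache_release(cow_page);
                }
                return ret;
        }

        *cow_pagep = cow_page;
        return ret;
}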
[akpm@linux-foundation.org: fix code layout] Signed-off-by: KAMEZAWA Hiroyuki Reported-by: Lutz Vieweg Original-idea-by: Michal Hocko Cc: Michal Hocko Cc: Ying Han Cc: Johannes Weiner Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a58bbebb3070..3c9f3aa8332e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3093,14 +3093,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *page_table; spinlock_t *ptl; struct page *page; + struct page *cow_page; pte_t entry; int anon = 0; - int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; int page_mkwrite = 0; + /* + * If we do COW later, allocate page befor taking lock_page() + * on the file cache page. This will reduce lock holding time. + */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!cow_page) + return VM_FAULT_OOM; + + if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { + page_cache_release(cow_page); + return VM_FAULT_OOM; + } + } else + cow_page = NULL; + vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; @@ -3109,12 +3129,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; + goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - return VM_FAULT_HWPOISON; + ret = VM_FAULT_HWPOISON; + goto uncharge_out; } /* @@ -3132,23 +3153,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + page = cow_page; anon = 1; - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto out; - } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - page_cache_release(page); - goto out; - } - charged = 1; copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3217,8 +3223,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, page_table); } else { - if (charged) - mem_cgroup_uncharge_page(page); + if (cow_page) + mem_cgroup_uncharge_page(cow_page); if (anon) page_cache_release(page); else @@ -3227,7 +3233,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); -out: if (dirty_page) { struct address_space *mapping = page->mapping; @@ -3257,6 +3262,13 @@ out: unwritable_page: page_cache_release(page); return ret; +uncharge_out: + /* fs's fault handler get error */ + if (cow_page) { + mem_cgroup_uncharge_page(cow_page); + page_cache_release(cow_page); + } + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From cd38b115d5ad79b0100ac6daa103c4fe2c50a913 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jul 2011 17:12:29 -0700 Subject: mm: page allocator: initialise 
ZLC for first zone eligible for zone_reclaim There have been a small number of complaints about significant stalls while copying large amounts of data on NUMA machines reported on a distribution bugzilla. In these cases, zone_reclaim was enabled by default due to large NUMA distances. In general, the complaints have not been about the workload itself unless it was a file server (in which case the recommendation was disable zone_reclaim). The stalls are mostly due to significant amounts of time spent scanning the preferred zone for pages to free. After a failure, it might fallback to another node (as zonelists are often node-ordered rather than zone-ordered) but stall quickly again when the next allocation attempt occurs. In bad cases, each page allocated results in a full scan of the preferred zone. Patch 1 checks the preferred zone for recent allocation failure which is particularly important if zone_reclaim has failed recently. This avoids rescanning the zone in the near future and instead falling back to another node. This may hurt node locality in some cases but a failure to zone_reclaim is more expensive than a remote access. Patch 2 clears the zlc information after direct reclaim. Otherwise, zone_reclaim can mark zones full, direct reclaim can reclaim enough pages but the zone is still not considered for allocation. This was tested on a 24-thread 2-node x86_64 machine. The tests were focused on large amounts of IO. All tests were bound to the CPUs on node-0 to avoid disturbances due to processes being scheduled on different nodes. The kernels tested are 3.0-rc6-vanilla Vanilla 3.0-rc6 zlcfirst Patch 1 applied zlcreconsider Patches 1+2 applied FS-Mark ./fs_mark -d /tmp/fsmark-10813 -D 100 -N 5000 -n 208 -L 35 -t 24 -S0 -s 524288 fsmark-3.0-rc6 3.0-rc6 3.0-rc6 vanilla zlcfirs zlcreconsider Files/s min 54.90 ( 0.00%) 49.80 (-10.24%) 49.10 (-11.81%) Files/s mean 100.11 ( 0.00%) 135.17 (25.94%) 146.93 (31.87%) Files/s stddev 57.51 ( 0.00%) 138.97 (58.62%) 158.69 (63.76%) Files/s max 361.10 ( 0.00%) 834.40 (56.72%) 802.40 (55.00%) Overhead min 76704.00 ( 0.00%) 76501.00 ( 0.27%) 77784.00 (-1.39%) Overhead mean 1485356.51 ( 0.00%) 1035797.83 (43.40%) 1594680.26 (-6.86%) Overhead stddev 1848122.53 ( 0.00%) 881489.88 (109.66%) 1772354.90 ( 4.27%) Overhead max 7989060.00 ( 0.00%) 3369118.00 (137.13%) 10135324.00 (-21.18%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 501.49 493.91 499.93 Total Elapsed Time (seconds) 2451.57 2257.48 2215.92 MMTests Statistics: vmstat Page Ins 46268 63840 66008 Page Outs 90821596 90671128 88043732 Swap Ins 0 0 0 Swap Outs 0 0 0 Direct pages scanned 13091697 8966863 8971790 Kswapd pages scanned 0 1830011 1831116 Kswapd pages reclaimed 0 1829068 1829930 Direct pages reclaimed 13037777 8956828 8648314 Kswapd efficiency 100% 99% 99% Kswapd velocity 0.000 810.643 826.346 Direct efficiency 99% 99% 96% Direct velocity 5340.128 3972.068 4048.788 Percentage direct scans 100% 83% 83% Page writes by reclaim 0 3 0 Slabs scanned 796672 720640 720256 Direct inode steals 7422667 7160012 7088638 Kswapd inode steals 0 1736840 2021238 Test completes far faster with a large increase in the number of files created per second. Standard deviation is high as a small number of iterations were much higher than the mean. The number of pages scanned by zone_reclaim is reduced and kswapd is used for more work. 
LARGE DD 3.0-rc6 3.0-rc6 3.0-rc6 vanilla zlcfirst zlcreconsider download tar 59 ( 0.00%) 59 ( 0.00%) 55 ( 7.27%) dd source files 527 ( 0.00%) 296 (78.04%) 320 (64.69%) delete source 36 ( 0.00%) 19 (89.47%) 20 (80.00%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 125.03 118.98 122.01 Total Elapsed Time (seconds) 624.56 375.02 398.06 MMTests Statistics: vmstat Page Ins 3594216 439368 407032 Page Outs 23380832 23380488 23377444 Swap Ins 0 0 0 Swap Outs 0 436 287 Direct pages scanned 17482342 69315973 82864918 Kswapd pages scanned 0 519123 575425 Kswapd pages reclaimed 0 466501 522487 Direct pages reclaimed 5858054 2732949 2712547 Kswapd efficiency 100% 89% 90% Kswapd velocity 0.000 1384.254 1445.574 Direct efficiency 33% 3% 3% Direct velocity 27991.453 184832.737 208171.929 Percentage direct scans 100% 99% 99% Page writes by reclaim 0 5082 13917 Slabs scanned 17280 29952 35328 Direct inode steals 115257 1431122 332201 Kswapd inode steals 0 0 979532 This test downloads a large tarfile and copies it with dd a number of times - similar to the most recent bug report I've dealt with. Time to completion is reduced. The number of pages scanned directly is still disturbingly high with a low efficiency but this is likely due to the number of dirty pages encountered. The figures could probably be improved with more work around how kswapd is used and how dirty pages are handled but that is separate work and this result is significant on its own. Streaming Mapped Writer MMTests Statistics: duration User/Sys Time Running Test (seconds) 124.47 111.67 112.64 Total Elapsed Time (seconds) 2138.14 1816.30 1867.56 MMTests Statistics: vmstat Page Ins 90760 89124 89516 Page Outs 121028340 120199524 120736696 Swap Ins 0 86 55 Swap Outs 0 0 0 Direct pages scanned 114989363 96461439 96330619 Kswapd pages scanned 56430948 56965763 57075875 Kswapd pages reclaimed 27743219 27752044 27766606 Direct pages reclaimed 49777 46884 36655 Kswapd efficiency 49% 48% 48% Kswapd velocity 26392.541 31363.631 30561.736 Direct efficiency 0% 0% 0% Direct velocity 53780.091 53108.759 51581.004 Percentage direct scans 67% 62% 62% Page writes by reclaim 385 122 1513 Slabs scanned 43008 39040 42112 Direct inode steals 0 10 8 Kswapd inode steals 733 534 477 This test just creates a large file mapping and writes to it linearly. Time to completion is again reduced. The gains are mostly down to two things. In many cases, there is less scanning as zone_reclaim simply gives up faster due to recent failures. The second reason is that memory is used more efficiently. Instead of scanning the preferred zone every time, the allocator falls back to another zone and uses it instead improving overall memory utilisation. This patch: initialise ZLC for first zone eligible for zone_reclaim. The zonelist cache (ZLC) is used among other things to record if zone_reclaim() failed for a particular zone recently. The intention is to avoid a high cost scanning extremely long zonelists or scanning within the zone uselessly. Currently the zonelist cache is setup only after the first zone has been considered and zone_reclaim() has been called. The objective was to avoid a costly setup but zone_reclaim is itself quite expensive. If it is failing regularly such as the first eligible zone having mostly mapped pages, the cost in scanning and allocation stalls is far higher than the ZLC initialisation step. This patch initialises ZLC before the first eligible zone calls zone_reclaim(). 
Once initialised, it is checked whether the zone failed zone_reclaim recently. If it has, the zone is skipped. As the first zone is now being checked, additional care has to be taken about zones marked full. A zone can be marked "full" because it should not have enough unmapped pages for zone_reclaim but this is excessive as direct reclaim or kswapd may succeed where zone_reclaim fails. Only mark zones "full" after zone_reclaim fails if it failed to reclaim enough pages after scanning. Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9119faae6e6a..830a465958de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1664,7 +1664,7 @@ zonelist_scan: continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) - goto try_next_zone; + continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { @@ -1676,17 +1676,36 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup if there are multiple nodes + * and before considering the first zone allowed + * by the cpuset. + */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; + } + if (zone_reclaim_mode == 0) goto this_zone_full; + /* + * As we may have just activated ZLC, check if the first + * eligible zone has failed zone_reclaim recently. + */ + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + ret = zone_reclaim(zone, gfp_mask, order); switch (ret) { case ZONE_RECLAIM_NOSCAN: /* did not scan */ - goto try_next_zone; + continue; case ZONE_RECLAIM_FULL: /* scanned but unreclaimable */ - goto this_zone_full; + continue; default: /* did we reclaim enough */ if (!zone_watermark_ok(zone, order, mark, @@ -1703,16 +1722,6 @@ try_this_zone: this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); -try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup after the first zone is tried but only - * if there are multiple nodes make it worthwhile - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } } if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { -- cgit v1.2.3 From 76d3fbf8fbf6cc78ceb63549e0e0c5bc8a88f838 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jul 2011 17:12:30 -0700 Subject: mm: page allocator: reconsider zones for allocation after direct reclaim With zone_reclaim_mode enabled, it's possible for zones to be considered full in the zonelist_cache so they are skipped in the future. If the process enters direct reclaim, the ZLC may still consider zones to be full even after reclaiming pages. Reconsider all zones for allocation if direct reclaim returns successfully. 
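In outline, and only as a hedged sketch with the drain/retry details of the real slow path trimmed, the direct-reclaim step then reads roughly as below; it pairs with patch 1, which sets up the zonelist cache before the first eligible zone is tried.

static struct page *example_reclaim_then_retry(gfp_t gfp_mask, unsigned int order,
                struct zonelist *zonelist, enum zone_type high_zoneidx,
                nodemask_t *nodemask, int alloc_flags,
                struct zone *preferred_zone, int migratetype,
                unsigned long *did_some_progress)
{
        *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask,
                                               nodemask);
        if (unlikely(!*did_some_progress))
                return NULL;

        /* Reclaim made progress, so stale "zone full" verdicts may be wrong */
        if (NUMA_BUILD)
                zlc_clear_zones_full(zonelist);

        return get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                                      high_zoneidx, alloc_flags,
                                      preferred_zone, migratetype);
}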
Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 830a465958de..094472377d81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) set_bit(i, zlc->fullzones); } +/* + * clear all zones full, called after direct reclaim makes progress so that + * a zone that was recently full is not skipped over for up to a second + */ +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } + +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1963,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; + /* After successful reclaim, reconsider all zones for allocation */ + if (NUMA_BUILD) + zlc_clear_zones_full(zonelist); + retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, -- cgit v1.2.3 From 72c4783210f77fd743f0a316858d33f27db51e7c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 25 Jul 2011 17:12:31 -0700 Subject: mm: remove useless rcu lock-unlock from mapping_tagged() radix_tree_tagged() is lockless - it reads from a member of the raid-tree root node. It does not require any protection. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f698862420..919b45eb57ad 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1405,10 +1405,6 @@ EXPORT_SYMBOL(test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - int ret; - rcu_read_lock(); - ret = radix_tree_tagged(&mapping->page_tree, tag); - rcu_read_unlock(); - return ret; + return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); -- cgit v1.2.3 From 2efaca927f5cd7ecd0f1554b8f9b6a9a2c329c03 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 25 Jul 2011 17:12:32 -0700 Subject: mm/futex: fix futex writes on archs with SW tracking of dirty & young I haven't reproduced it myself but the fail scenario is that on such machines (notably ARM and some embedded powerpc), if you manage to hit that futex path on a writable page whose dirty bit has gone from the PTE, you'll livelock inside the kernel from what I can tell. It will go in a loop of trying the atomic access, failing, trying gup to "fix it up", getting succcess from gup, go back to the atomic access, failing again because dirty wasn't fixed etc... So I think you essentially hang in the kernel. 
The scenario is probably rare'ish because affected architecture are embedded and tend to not swap much (if at all) so we probably rarely hit the case where dirty is missing or young is missing, but I think Shan has a piece of SW that can reliably reproduce it using a shared writable mapping & fork or something like that. On archs who use SW tracking of dirty & young, a page without dirty is effectively mapped read-only and a page without young unaccessible in the PTE. Additionally, some architectures might lazily flush the TLB when relaxing write protection (by doing only a local flush), and expect a fault to invalidate the stale entry if it's still present on another processor. The futex code assumes that if the "in_atomic()" access -EFAULT's, it can "fix it up" by causing get_user_pages() which would then be equivalent to taking the fault. However that isn't the case. get_user_pages() will not call handle_mm_fault() in the case where the PTE seems to have the right permissions, regardless of the dirty and young state. It will eventually update those bits ... in the struct page, but not in the PTE. Additionally, it will not handle the lazy TLB flushing that can be required by some architectures in the fault case. Basically, gup is the wrong interface for the job. The patch provides a more appropriate one which boils down to just calling handle_mm_fault() since what we are trying to do is simulate a real page fault. The futex code currently attempts to write to user memory within a pagefault disabled section, and if that fails, tries to fix it up using get_user_pages(). This doesn't work on archs where the dirty and young bits are maintained by software, since they will gate access permission in the TLB, and will not be updated by gup(). In addition, there's an expectation on some archs that a spurious write fault triggers a local TLB flush, and that is missing from the picture as well. I decided that adding those "features" to gup() would be too much for this already too complex function, and instead added a new simpler fixup_user_fault() which is essentially a wrapper around handle_mm_fault() which the futex code can call. 
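For readers who want the caller's side, a hedged sketch of the retry pattern the commit describes; the function name example_put_user_retry is invented here, and the real futex code uses its own atomic access helpers rather than __put_user().

static int example_put_user_retry(u32 __user *uaddr, u32 val)
{
        struct mm_struct *mm = current->mm;
        int ret;

        for (;;) {
                pagefault_disable();
                ret = __put_user(val, uaddr);   /* fails with -EFAULT, never sleeps */
                pagefault_enable();
                if (!ret)
                        return 0;

                /* Simulate a real write fault, with mmap_sem held for read */
                down_read(&mm->mmap_sem);
                ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
                                       FAULT_FLAG_WRITE);
                up_read(&mm->mmap_sem);
                if (ret < 0)
                        return ret;     /* genuinely bad address, OOM, etc. */
        }
}

Because fixup_user_fault() goes through handle_mm_fault(), the software dirty/young fixup and any required TLB flush happen before the retry, so the loop terminates instead of livelocking the way the gup()-based fixup could.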
[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix some nits Darren saw, fiddle comment layout] Signed-off-by: Benjamin Herrenschmidt Reported-by: Shan Hai Tested-by: Shan Hai Cc: David Laight Acked-by: Peter Zijlstra Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ kernel/futex.c | 4 ++-- mm/memory.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3cccd053850f..3172a1c0f08e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -988,6 +988,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); diff --git a/kernel/futex.c b/kernel/futex.c index 3fbc76cbb9aa..0a308970c24a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; diff --git a/mm/memory.c b/mm/memory.c index 3c9f3aa8332e..a56e3ba816b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1805,7 +1805,63 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. 
+ */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. -- cgit v1.2.3 From 708e3508c2a2204cc276dcdb543009a441bfe91b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:32 -0700 Subject: tmpfs: clone shmem_file_splice_read() Copy __generic_file_splice_read() and generic_file_splice_read() from fs/splice.c to shmem_file_splice_read() in mm/shmem.c. Make page_cache_pipe_buf_ops and spd_release_page() accessible to it. Signed-off-by: Hugh Dickins Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/splice.c | 4 +- include/linux/splice.h | 2 + mm/shmem.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 221 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/splice.c b/fs/splice.c index aa866d309695..fa2defa8afcf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -132,7 +132,7 @@ error: return err; } -static const struct pipe_buf_operations page_cache_pipe_buf_ops = { +const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -264,7 +264,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } -static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { page_cache_release(spd->pages[i]); } diff --git a/include/linux/splice.h b/include/linux/splice.h index 997c3b4c212b..26e5b613deda 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -88,5 +88,7 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); extern void splice_shrink_spd(struct pipe_inode_info *, struct splice_pipe_desc *); +extern void spd_release_page(struct splice_pipe_desc *, unsigned int); +extern const struct pipe_buf_operations page_cache_pipe_buf_ops; #endif diff --git a/mm/shmem.c b/mm/shmem.c index c1db11cf220d..d176e488f04d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -1844,6 +1845,221 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, return retval; } +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize, left; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = 
spd_release_page, + }; + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, pipe->buffers); + + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, + nr_pages, spd.pages); + index += spd.nr_pages; + + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * readahead/allocate the rest and fill in the holes. + */ + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); + + error = 0; + while (spd.nr_pages < nr_pages) { + /* + * Page could be there, find_get_pages_contig() breaks on + * the first hole. + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * page didn't exist, allocate one. + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL); + if (unlikely(error)) { + page_cache_release(page); + if (error == -EEXIST) + continue; + break; + } + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); + } + + spd.pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. + */ + if (!page->mapping) { + unlock_page(page); + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); + if (unlikely(error)) { + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ + if (error == AOP_TRUNCATED_PAGE) + error = 0; + + break; + } + } +fill_it: + /* + * i_size must be checked after PageUptodate. 
+ */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; + + /* + * max good bytes in this page + */ + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + /* + * force quit after adding this page + */ + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + /* + * Release any pages at the end, if we quit early. 'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + + if (error > 0) { + *ppos += error; + file_accessed(in); + } + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2699,7 +2915,7 @@ static const struct file_operations shmem_file_operations = { .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, #endif }; -- cgit v1.2.3 From 71f0e07a605fad1fb6b288e4dc1dd8dfa78f4872 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:33 -0700 Subject: tmpfs: refine shmem_file_splice_read Tidy up shmem_file_splice_read(): Remove readahead: okay, we could implement shmem readahead on swap, but have never done so before, swap being the slow exceptional path. Use shmem_getpage() instead of find_or_create_page() plus ->readpage(). Remove several comments: sorry, I found them more distracting than helpful, and this will not be the reference version of splice_read(). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 138 +++++++++---------------------------------------------------- 1 file changed, 19 insertions(+), 119 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index d176e488f04d..f96614526d1c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1850,6 +1850,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; + struct inode *inode = mapping->host; unsigned int loff, nr_pages, req_pages; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; @@ -1865,7 +1866,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; - isize = i_size_read(in->f_mapping->host); + isize = i_size_read(inode); if (unlikely(*ppos >= isize)) return 0; @@ -1881,153 +1882,57 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nr_pages = min(req_pages, pipe->buffers); - /* - * Lookup the (hopefully) full range of pages we need. - */ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; - - /* - * If find_get_pages_contig() returned fewer pages than we needed, - * readahead/allocate the rest and fill in the holes. 
- */ - if (spd.nr_pages < nr_pages) - page_cache_sync_readahead(mapping, &in->f_ra, in, - index, req_pages - spd.nr_pages); - error = 0; - while (spd.nr_pages < nr_pages) { - /* - * Page could be there, find_get_pages_contig() breaks on - * the first hole. - */ - page = find_get_page(mapping, index); - if (!page) { - /* - * page didn't exist, allocate one. - */ - page = page_cache_alloc_cold(mapping); - if (!page) - break; - - error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); - if (unlikely(error)) { - page_cache_release(page); - if (error == -EEXIST) - continue; - break; - } - /* - * add_to_page_cache() locks the page, unlock it - * to avoid convoluting the logic below even more. - */ - unlock_page(page); - } + while (spd.nr_pages < nr_pages) { + page = NULL; + error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); spd.pages[spd.nr_pages++] = page; index++; } - /* - * Now loop over the map and see if we need to start IO on any - * pages, fill in the partial map, etc. - */ index = *ppos >> PAGE_CACHE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { unsigned int this_len; if (!len) break; - /* - * this_len is the max we'll use from this page - */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); page = spd.pages[page_nr]; - if (PageReadahead(page)) - page_cache_async_readahead(mapping, &in->f_ra, in, - page, index, req_pages - page_nr); - - /* - * If the page isn't uptodate, we may need to start io on it - */ - if (!PageUptodate(page)) { - lock_page(page); - - /* - * Page was truncated, or invalidated by the - * filesystem. Redo the find/create, but this time the - * page is kept locked, so there's no chance of another - * race with truncate/invalidate. - */ - if (!page->mapping) { - unlock_page(page); - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - - if (!page) { - error = -ENOMEM; - break; - } - page_cache_release(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } - /* - * page was already under io and is now done, great - */ - if (PageUptodate(page)) { - unlock_page(page); - goto fill_it; - } - - /* - * need to read in the page - */ - error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - /* - * We really should re-lookup the page here, - * but it complicates things a lot. Instead - * lets just do what we already stored, and - * we'll get it the next time we are called. - */ - if (error == AOP_TRUNCATED_PAGE) - error = 0; - + if (!PageUptodate(page) || page->mapping != mapping) { + page = NULL; + error = shmem_getpage(inode, index, &page, + SGP_CACHE, NULL); + if (error) break; - } + unlock_page(page); + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } -fill_it: - /* - * i_size must be checked after PageUptodate. - */ - isize = i_size_read(mapping->host); + + isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) break; - /* - * if this is the last page, see if we need to shrink - * the length and stop - */ if (end_index == index) { unsigned int plen; - /* - * max good bytes in this page - */ plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (plen <= loff) break; - /* - * force quit after adding this page - */ this_len = min(this_len, plen - loff); len = this_len; } @@ -2040,13 +1945,8 @@ fill_it: index++; } - /* - * Release any pages at the end, if we quit early. 
'page_nr' is how far - * we got, 'nr_pages' is how many pages are in the map. - */ while (page_nr < nr_pages) page_cache_release(spd.pages[page_nr++]); - in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); -- cgit v1.2.3 From 68da9f055755ee2609a1686722e6d6a7980019ee Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:34 -0700 Subject: tmpfs: pass gfp to shmem_getpage_gfp Make shmem_getpage() a wrapper, passing mapping_gfp_mask() down to shmem_getpage_gfp(), which in turn passes gfp down to shmem_swp_alloc(). Change shmem_read_mapping_page_gfp() to use shmem_getpage_gfp() in the CONFIG_SHMEM case; but leave tiny !SHMEM using read_cache_page_gfp(). Add a BUG_ON() in case anyone happens to call this on a non-shmem mapping; though we might later want to let that case route to read_cache_page_gfp(). It annoys me to have these two almost-redundant args, gfp and fault_type: I can't find a better way; but initialize fault_type only in shmem_fault(). Note that before, read_cache_page_gfp() was allocating i915_gem's pages with __GFP_NORETRY as intended; but the corresponding swap vector pages got allocated without it, leaving a small possibility of OOM. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 67 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f96614526d1c..f6c94ba87808 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -127,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type); +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + +static inline int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, int *fault_type) +{ + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), fault_type); +} static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) { @@ -404,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns * @info: info structure for the inode * @index: index of the page to find * @sgp: check and recheck i_size? skip allocation? + * @gfp: gfp mask to use for any page allocation * * If the entry does not exist, allocate it. */ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, + unsigned long index, enum sgp_type sgp, gfp_t gfp) { struct inode *inode = &info->vfs_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -435,7 +444,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + page = shmem_dir_alloc(gfp); spin_lock(&info->lock); if (!page) { @@ -1225,14 +1234,14 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #endif /* - * shmem_getpage - either get the page from swap or allocate a new one + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. 
If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type) +static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1242,15 +1251,11 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; - gfp_t gfp; int error; if (idx >= SHMEM_MAX_INDEX) return -EFBIG; - if (type) - *type = 0; - /* * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read @@ -1264,13 +1269,12 @@ repeat: filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; - gfp = mapping_gfp_mask(mapping); if (!filepage) { /* * Try to preload while we can wait, to not make a habit of * draining atomic reserves; but don't latch on to this cpu. */ - error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); if (error) goto failed; radix_tree_preload_end(); @@ -1290,7 +1294,7 @@ repeat: spin_lock(&info->lock); shmem_recalc_inode(inode); - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) { spin_unlock(&info->lock); error = PTR_ERR(entry); @@ -1305,12 +1309,12 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); /* here we actually do the io */ - if (type) - *type |= VM_FAULT_MAJOR; + if (fault_type) + *fault_type |= VM_FAULT_MAJOR; swappage = shmem_swapin(swap, gfp, info, idx); if (!swappage) { spin_lock(&info->lock); - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) error = PTR_ERR(entry); else { @@ -1461,7 +1465,7 @@ repeat: SetPageSwapBacked(filepage); } - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) error = PTR_ERR(entry); else { @@ -1539,7 +1543,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; int error; - int ret; + int ret = VM_FAULT_LOCKED; if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; @@ -1547,11 +1551,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (ret & VM_FAULT_MAJOR) { count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); } - return ret | VM_FAULT_LOCKED; + return ret; } #ifdef CONFIG_NUMA @@ -3162,13 +3167,29 @@ int shmem_zero_setup(struct vm_area_struct *vma) * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * - * Provide a stub for those callers to start using now, then later - * flesh it out to call shmem_getpage() with additional gfp mask, when - * shmem_file_splice_read() is added and shmem_readpage() is removed. + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 
*/ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { +#ifdef CONFIG_SHMEM + struct inode *inode = mapping->host; + struct page *page = NULL; + int error; + + BUG_ON(mapping->a_ops != &shmem_aops); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + if (error) + page = ERR_PTR(error); + else + unlock_page(page); + return page; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ return read_cache_page_gfp(mapping, index, gfp); +#endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); -- cgit v1.2.3 From 9276aad6c898dbcc31d095f2934dedd5cbb2e93e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:34 -0700 Subject: tmpfs: remove_shmem_readpage Remove that pernicious shmem_readpage() at last: the things we needed it for (splice, loop, sendfile, i915 GEM) are now fully taken care of by shmem_file_splice_read() and shmem_read_mapping_page_gfp(). This removal clears the way for a simpler shmem_getpage_gfp(), since page is never passed in; but leave most of that cleanup until after. sys_readahead() and sys_fadvise(POSIX_FADV_WILLNEED) will now EINVAL, instead of unexpectedly trying to read ahead on tmpfs: if that proves to be an issue for someone, then we can either arrange for them to return success instead, or try to implement async readahead on tmpfs. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f6c94ba87808..ff6713a2579e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1246,7 +1246,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct page *filepage = *pagep; + struct page *filepage; struct page *swappage; struct page *prealloc_page = NULL; swp_entry_t *entry; @@ -1255,18 +1255,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, if (idx >= SHMEM_MAX_INDEX) return -EFBIG; - - /* - * Normally, filepage is NULL on entry, and either found - * uptodate immediately, or allocated and zeroed, or read - * in under swappage, which is then assigned to filepage. - * But shmem_readpage (required for splice) passes in a locked - * filepage, which may be found not uptodate by other callers - * too, and may need to be copied from the swappage read in. - */ repeat: - if (!filepage) - filepage = find_lock_page(mapping, idx); + filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; if (!filepage) { @@ -1513,8 +1503,7 @@ nospace: * Perhaps the page was brought in from swap between find_lock_page * and taking info->lock? We allow for that at add_to_page_cache_lru, * but must also avoid reporting a spurious ENOSPC while working on a - * full tmpfs. (When filepage has been passed in to shmem_getpage, it - * is already in page cache, which prevents this race from occurring.) + * full tmpfs. 
*/ if (!filepage) { struct page *page = find_get_page(mapping, idx); @@ -1527,7 +1516,7 @@ nospace: spin_unlock(&info->lock); error = -ENOSPC; failed: - if (*pagep != filepage) { + if (filepage) { unlock_page(filepage); page_cache_release(filepage); } @@ -1673,19 +1662,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_symlink_inline_operations; -/* - * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; - * but providing them allows a tmpfs file to be used for splice, sendfile, and - * below the loop driver, in the generic fashion that many filesystems support. - */ -static int shmem_readpage(struct file *file, struct page *page) -{ - struct inode *inode = page->mapping->host; - int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); - unlock_page(page); - return error; -} - static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1693,7 +1669,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = NULL; return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1893,7 +1868,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, error = 0; while (spd.nr_pages < nr_pages) { - page = NULL; error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); if (error) break; @@ -1916,7 +1890,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { - page = NULL; error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); if (error) @@ -2125,7 +2098,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int error; int len; struct inode *inode; - struct page *page = NULL; + struct page *page; char *kaddr; struct shmem_inode_info *info; @@ -2803,7 +2776,6 @@ static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS - .readpage = shmem_readpage, .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif @@ -3175,7 +3147,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; - struct page *page = NULL; + struct page *page; int error; BUG_ON(mapping->a_ops != &shmem_aops); -- cgit v1.2.3 From e83c32e8f92724a06a22a3b42f3afc07db93e131 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:35 -0700 Subject: tmpfs: simplify prealloc_page The prealloc_page handling in shmem_getpage_gfp() is unnecessarily complicated: first simplify that before going on to filepage/swappage. That's right, don't report ENOMEM when the preallocation fails: we may or may not need the page. But simply report ENOMEM once we find we do need it, instead of dropping lock, repeating allocation, unwinding on failure etc. And leave the out label on the fast path, don't goto. Fix something that looks like a bug but turns out not to be: set PageSwapBacked on prealloc_page before its mem_cgroup_cache_charge(), as the removed case was doing. 
That's important before adding to LRU (determines which LRU the page goes on), and does affect which path it takes through memcontrol.c, but in the end MEM_CGROUP_CHANGE_TYPE_ SHMEM is handled no differently from CACHE. Signed-off-by: Hugh Dickins Acked-by: Shaohua Li Cc: "Zhang, Yanmin" Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 60 ++++++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index ff6713a2579e..8f8534f35476 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1269,9 +1269,9 @@ repeat: goto failed; radix_tree_preload_end(); if (sgp != SGP_READ && !prealloc_page) { - /* We don't care if this fails */ prealloc_page = shmem_alloc_page(gfp, info, idx); if (prealloc_page) { + SetPageSwapBacked(prealloc_page); if (mem_cgroup_cache_charge(prealloc_page, current->mm, GFP_KERNEL)) { page_cache_release(prealloc_page); @@ -1403,7 +1403,8 @@ repeat: goto repeat; } spin_unlock(&info->lock); - } else { + + } else if (prealloc_page) { shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { @@ -1419,41 +1420,8 @@ repeat: if (!filepage) { int ret; - if (!prealloc_page) { - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - spin_lock(&info->lock); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); - - /* - * Precharge page while we can wait, compensate - * after - */ - error = mem_cgroup_cache_charge(filepage, - current->mm, GFP_KERNEL); - if (error) { - page_cache_release(filepage); - spin_lock(&info->lock); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - filepage = NULL; - goto failed; - } - - spin_lock(&info->lock); - } else { - filepage = prealloc_page; - prealloc_page = NULL; - SetPageSwapBacked(filepage); - } + filepage = prealloc_page; + prealloc_page = NULL; entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) @@ -1492,11 +1460,20 @@ repeat: SetPageUptodate(filepage); if (sgp == SGP_DIRTY) set_page_dirty(filepage); + } else { + spin_unlock(&info->lock); + error = -ENOMEM; + goto out; } done: *pagep = filepage; error = 0; - goto out; +out: + if (prealloc_page) { + mem_cgroup_uncharge_cache_page(prealloc_page); + page_cache_release(prealloc_page); + } + return error; nospace: /* @@ -1520,12 +1497,7 @@ failed: unlock_page(filepage); page_cache_release(filepage); } -out: - if (prealloc_page) { - mem_cgroup_uncharge_cache_page(prealloc_page); - page_cache_release(prealloc_page); - } - return error; + goto out; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -- cgit v1.2.3 From 27ab700626f048407e9466d389a43c7d3aa45967 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:36 -0700 Subject: tmpfs: simplify filepage/swappage We can now simplify shmem_getpage_gfp(): there is no longer a dilemma of filepage passed in via shmem_readpage(), then swappage found, which must then be copied over to it. Although at first it's tempting to replace the **pagep arg by returning struct page *, that makes a mess of IS_ERR_OR_NULL(page)s in all the callers, so leave as is. 
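As a rough illustration of that point (a sketch, not part of the patch itself): with a returned pointer every caller would have to tell NULL and ERR_PTR() apart, whereas the **pagep convention keeps the error code separate:

	/* hypothetical pointer-returning variant, NOT the interface used */
	page = shmem_getpage_ptr(inode, index, SGP_CACHE, NULL);
	if (IS_ERR_OR_NULL(page))
		return page ? PTR_ERR(page) : -ENOMEM;	/* NULL and ERR_PTR need separate handling */

	/* interface kept by the patch: error code and page come back separately */
	error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
	if (error)
		return error;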
Insert BUG_ON(!PageUptodate) when we find and lock page: some of the complication came from uninitialized pages inserted into filecache prior to readpage; but now we're in control, and only release pagelock on filecache once it's uptodate (if an error occurs in reading back from swap, the page remains in swapcache, never moved to filecache). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 237 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 108 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 8f8534f35476..bf6e9c11d859 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1246,41 +1246,47 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct page *filepage; - struct page *swappage; + struct page *page; struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; int error; + int ret; if (idx >= SHMEM_MAX_INDEX) return -EFBIG; repeat: - filepage = find_lock_page(mapping, idx); - if (filepage && PageUptodate(filepage)) - goto done; - if (!filepage) { + page = find_lock_page(mapping, idx); + if (page) { /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu. + * Once we can get the page lock, it must be uptodate: + * if there were an error in reading back from swap, + * the page would not be inserted into the filecache. */ - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); - if (error) - goto failed; - radix_tree_preload_end(); - if (sgp != SGP_READ && !prealloc_page) { - prealloc_page = shmem_alloc_page(gfp, info, idx); - if (prealloc_page) { - SetPageSwapBacked(prealloc_page); - if (mem_cgroup_cache_charge(prealloc_page, - current->mm, GFP_KERNEL)) { - page_cache_release(prealloc_page); - prealloc_page = NULL; - } + BUG_ON(!PageUptodate(page)); + goto done; + } + + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu. + */ + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (error) + goto out; + radix_tree_preload_end(); + + if (sgp != SGP_READ && !prealloc_page) { + prealloc_page = shmem_alloc_page(gfp, info, idx); + if (prealloc_page) { + SetPageSwapBacked(prealloc_page); + if (mem_cgroup_cache_charge(prealloc_page, + current->mm, GFP_KERNEL)) { + page_cache_release(prealloc_page); + prealloc_page = NULL; } } } - error = 0; spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1288,21 +1294,21 @@ repeat: if (IS_ERR(entry)) { spin_unlock(&info->lock); error = PTR_ERR(entry); - goto failed; + goto out; } swap = *entry; if (swap.val) { /* Look it up and read it in.. 
*/ - swappage = lookup_swap_cache(swap); - if (!swappage) { + page = lookup_swap_cache(swap); + if (!page) { shmem_swp_unmap(entry); spin_unlock(&info->lock); /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; - swappage = shmem_swapin(swap, gfp, info, idx); - if (!swappage) { + page = shmem_swapin(swap, gfp, info, idx); + if (!page) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) @@ -1314,62 +1320,42 @@ repeat: } spin_unlock(&info->lock); if (error) - goto failed; + goto out; goto repeat; } - wait_on_page_locked(swappage); - page_cache_release(swappage); + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } /* We have to do this with page locked to prevent races */ - if (!trylock_page(swappage)) { + if (!trylock_page(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - wait_on_page_locked(swappage); - page_cache_release(swappage); + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } - if (PageWriteback(swappage)) { + if (PageWriteback(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - wait_on_page_writeback(swappage); - unlock_page(swappage); - page_cache_release(swappage); + wait_on_page_writeback(page); + unlock_page(page); + page_cache_release(page); goto repeat; } - if (!PageUptodate(swappage)) { + if (!PageUptodate(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - unlock_page(swappage); - page_cache_release(swappage); + unlock_page(page); + page_cache_release(page); error = -EIO; - goto failed; + goto out; } - if (filepage) { - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - copy_highpage(filepage, swappage); - unlock_page(swappage); - page_cache_release(swappage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); - set_page_dirty(filepage); - swap_free(swap); - } else if (!(error = add_to_page_cache_locked(swappage, mapping, - idx, GFP_NOWAIT))) { - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - filepage = swappage; - set_page_dirty(filepage); - swap_free(swap); - } else { + error = add_to_page_cache_locked(page, mapping, + idx, GFP_NOWAIT); + if (error) { shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { @@ -1378,28 +1364,33 @@ repeat: * call memcg's OOM if needed. 
*/ error = mem_cgroup_shmem_charge_fallback( - swappage, - current->mm, - gfp); + page, current->mm, gfp); if (error) { - unlock_page(swappage); - page_cache_release(swappage); - goto failed; + unlock_page(page); + page_cache_release(page); + goto out; } } - unlock_page(swappage); - page_cache_release(swappage); + unlock_page(page); + page_cache_release(page); goto repeat; } - } else if (sgp == SGP_READ && !filepage) { + + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); - filepage = find_get_page(mapping, idx); - if (filepage && - (!PageUptodate(filepage) || !trylock_page(filepage))) { + delete_from_swap_cache(page); + spin_unlock(&info->lock); + set_page_dirty(page); + swap_free(swap); + + } else if (sgp == SGP_READ) { + shmem_swp_unmap(entry); + page = find_get_page(mapping, idx); + if (page && !trylock_page(page)) { spin_unlock(&info->lock); - wait_on_page_locked(filepage); - page_cache_release(filepage); - filepage = NULL; + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } spin_unlock(&info->lock); @@ -1417,56 +1408,52 @@ repeat: } else if (shmem_acct_block(info->flags)) goto nospace; - if (!filepage) { - int ret; + page = prealloc_page; + prealloc_page = NULL; - filepage = prealloc_page; - prealloc_page = NULL; - - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - swap = *entry; - shmem_swp_unmap(entry); - } - ret = error || swap.val; - if (ret) - mem_cgroup_uncharge_cache_page(filepage); - else - ret = add_to_page_cache_lru(filepage, mapping, + entry = shmem_swp_alloc(info, idx, sgp, gfp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + swap = *entry; + shmem_swp_unmap(entry); + } + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(page); + else + ret = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT); - /* - * At add_to_page_cache_lru() failure, uncharge will - * be done automatically. - */ - if (ret) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - page_cache_release(filepage); - filepage = NULL; - if (error) - goto failed; - goto repeat; - } - info->flags |= SHMEM_PAGEIN; + /* + * At add_to_page_cache_lru() failure, + * uncharge will be done automatically. + */ + if (ret) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); + page_cache_release(page); + if (error) + goto out; + goto repeat; } + info->flags |= SHMEM_PAGEIN; info->alloced++; spin_unlock(&info->lock); - clear_highpage(filepage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); if (sgp == SGP_DIRTY) - set_page_dirty(filepage); + set_page_dirty(page); + } else { spin_unlock(&info->lock); error = -ENOMEM; goto out; } done: - *pagep = filepage; + *pagep = page; error = 0; out: if (prealloc_page) { @@ -1482,21 +1469,13 @@ nospace: * but must also avoid reporting a spurious ENOSPC while working on a * full tmpfs. 
*/ - if (!filepage) { - struct page *page = find_get_page(mapping, idx); - if (page) { - spin_unlock(&info->lock); - page_cache_release(page); - goto repeat; - } - } + page = find_get_page(mapping, idx); spin_unlock(&info->lock); - error = -ENOSPC; -failed: - if (filepage) { - unlock_page(filepage); - page_cache_release(filepage); + if (page) { + page_cache_release(page); + goto repeat; } + error = -ENOSPC; goto out; } -- cgit v1.2.3 From 48f170fb7d7db8789ccc23e051af61f62af5f685 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:37 -0700 Subject: tmpfs: simplify unuse and writepage shmem_unuse_inode() and shmem_writepage() contain a little code to cope with pages inserted independently into the filecache, probably by a filesystem stacked on top of tmpfs, then fed to its ->readpage() or ->writepage(). Unionfs was indeed experimenting with working in that way three years ago, but I find no current examples: nowadays the stacking filesystems use vfs interfaces to the lower filesystem. It's now illegal: remove most of that code, adding some WARN_ON_ONCEs. Signed-off-by: Hugh Dickins Cc: Erez Zadok Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 50 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index bf6e9c11d859..7533574109da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -972,20 +972,7 @@ found: error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); /* which does mem_cgroup_uncharge_cache_page on error */ - if (error == -EEXIST) { - struct page *filepage = find_get_page(mapping, idx); - error = 1; - if (filepage) { - /* - * There might be a more uptodate page coming down - * from a stacked writepage: forget our swappage if so. - */ - if (PageUptodate(filepage)) - error = 0; - page_cache_release(filepage); - } - } - if (!error) { + if (error != -ENOMEM) { delete_from_swap_cache(page); set_page_dirty(page); info->flags |= SHMEM_PAGEIN; @@ -1072,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * shmem_backing_dev_info's capabilities prevent regular writeback or * sync from ever calling shmem_writepage; but a stacking filesystem - * may use the ->writepage of its underlying filesystem, in which case + * might use ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. However, in those cases, - * we do still want to check if there's a redundant swappage to be - * discarded. + * and not for the writeback threads or sync. */ - if (wbc->for_reclaim) - swap = get_swap_page(); - else - swap.val = 0; + if (!wbc->for_reclaim) { + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ + goto redirty; + } + swap = get_swap_page(); + if (!swap.val) + goto redirty; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, @@ -1092,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * we've taken the spinlock, because shmem_unuse_inode() will * prune a !swapped inode from the swaplist under both locks. 
*/ - if (swap.val) { - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); - } + mutex_lock(&shmem_swaplist_mutex); + if (list_empty(&info->swaplist)) + list_add_tail(&info->swaplist, &shmem_swaplist); spin_lock(&info->lock); - if (swap.val) - mutex_unlock(&shmem_swaplist_mutex); + mutex_unlock(&shmem_swaplist_mutex); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); @@ -1108,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } entry = shmem_swp_entry(info, index, NULL); if (entry->val) { - /* - * The more uptodate page coming down from a stacked - * writepage should replace our old swappage. - */ + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ free_swap_and_cache(*entry); shmem_swp_set(info, entry, 0); } shmem_recalc_inode(inode); - if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { delete_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); -- cgit v1.2.3 From 99b12e3d882bc7ebdfe0de381dff3b16d21c38f7 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 25 Jul 2011 17:12:37 -0700 Subject: writeback: account NR_WRITTEN at IO completion time NR_WRITTEN is now accounted at block IO enqueue time, which is not very accurate as to common understanding. This moves NR_WRITTEN accounting to the IO completion time and makes it more consistent with BDI_WRITTEN, which is used for bandwidth estimation. Signed-off-by: Wu Fengguang Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 919b45eb57ad..d8767b381b9c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1141,7 +1141,6 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); @@ -1358,8 +1357,10 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { dec_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); + } return ret; } -- cgit v1.2.3 From ae891a1b93bf62e9aaa116a7a71312375047fc9f Mon Sep 17 00:00:00 2001 From: Maxin B John Date: Mon, 25 Jul 2011 17:12:59 -0700 Subject: devres: fix possible use after free devres uses the pointer value as key after it's freed, which is safe but triggers spurious use-after-free warnings on some static analysis tools. Rearrange code to avoid such warnings. Signed-off-by: Maxin B. 
John Reviewed-by: Rolf Eike Beer Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/devres.c | 2 +- lib/devres.c | 2 +- mm/dmapool.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef4ffcdfa55..bd8e788d71e0 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { struct irq_devres match_data = { irq, dev_id }; - free_irq(irq, dev_id); WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, &match_data)); + free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); diff --git a/lib/devres.c b/lib/devres.c index 6efddf53b90c..7c0e953a7486 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -79,9 +79,9 @@ EXPORT_SYMBOL(devm_ioremap_nocache); */ void devm_iounmap(struct device *dev, void __iomem *addr) { - iounmap(addr); WARN_ON(devres_destroy(dev, devm_ioremap_release, devm_ioremap_match, (void *)addr)); + iounmap(addr); } EXPORT_SYMBOL(devm_iounmap); diff --git a/mm/dmapool.c b/mm/dmapool.c index 03bf3bb4519a..fbb58e346888 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - dma_pool_destroy(pool); WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); + dma_pool_destroy(pool); } EXPORT_SYMBOL(dmam_pool_destroy); -- cgit v1.2.3 From 1f4c025b5a5520fd2571244196b1b01ad96d18f6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:21 -0700 Subject: memcg: export memory cgroup's swappiness with mem_cgroup_swappiness() Each memory cgroup has a 'swappiness' value which can be accessed by get_swappiness(memcg). The major user is try_to_free_mem_cgroup_pages() and swappiness is passed by argument. It's propagated by scan_control. get_swappiness() is a static function but some planned updates will need to get swappiness from files other than memcontrol.c This patch exports get_swappiness() as mem_cgroup_swappiness(). With this, we can remove the argument of swapiness from try_to_free... and drop swappiness from scan_control. only memcg uses it. 
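As an illustration of the new interface (a sketch of the idea, mirroring the vmscan_swappiness() helper added in the vmscan.c hunk below), reclaim now picks the effective swappiness itself instead of carrying it in scan_control:

	static int vmscan_swappiness(struct scan_control *sc)
	{
		if (scanning_global_lru(sc))	/* global reclaim: use the sysctl value */
			return vm_swappiness;
		return mem_cgroup_swappiness(sc->mem_cgroup);	/* per-memcg value */
	}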
Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Michal Hocko Cc: Ying Han Cc: Shaohua Li Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 +++++++++---- mm/memcontrol.c | 15 +++++++-------- mm/vmscan.c | 23 ++++++++++------------- 3 files changed, 26 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index a273468f8285..44558b600ee3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -252,11 +252,9 @@ static inline void lru_cache_add_file(struct page *page) extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - unsigned int swappiness); + gfp_t gfp_mask, bool noswap); extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, - unsigned int swappiness, struct zone *zone, unsigned long *nr_scanned); extern int __isolate_lru_page(struct page *page, int mode, int file); @@ -299,7 +297,14 @@ static inline void scan_unevictable_unregister_node(struct node *node) extern int kswapd_run(int nid); extern void kswapd_stop(int nid); - +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +extern int mem_cgroup_swappiness(struct mem_cgroup *mem); +#else +static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) +{ + return vm_swappiness; +} +#endif #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e57d25..506d116a7d33 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -249,7 +249,7 @@ struct mem_cgroup { atomic_t oom_lock; atomic_t refcnt; - unsigned int swappiness; + int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -1329,7 +1329,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) return margin >> PAGE_SHIFT; } -static unsigned int get_swappiness(struct mem_cgroup *memcg) +int mem_cgroup_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1776,12 +1776,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone, - &nr_scanned); + noswap, zone, &nr_scanned); *total_scanned += nr_scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, get_swappiness(victim)); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3826,7 +3825,7 @@ try_to_free: goto out; } progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, get_swappiness(mem)); + false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4288,7 +4287,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - return get_swappiness(memcg); + return mem_cgroup_swappiness(memcg); } static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, @@ -4997,7 +4996,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) INIT_LIST_HEAD(&mem->oom_notify); if (parent) - mem->swappiness = get_swappiness(parent); + mem->swappiness = mem_cgroup_swappiness(parent); atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); diff --git a/mm/vmscan.c 
b/mm/vmscan.c index febbc044e792..05637491f244 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -95,8 +95,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - int swappiness; - int order; /* @@ -1770,6 +1768,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } +static int vmscan_swappiness(struct scan_control *sc) +{ + if (scanning_global_lru(sc)) + return vm_swappiness; + return mem_cgroup_swappiness(sc->mem_cgroup); +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -1830,8 +1835,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; + anon_prio = vmscan_swappiness(sc); + file_prio = 200 - vmscan_swappiness(sc); /* * OK, so we have swap space and a fair amount of page cache @@ -2220,7 +2225,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .nodemask = nodemask, @@ -2244,7 +2248,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, - unsigned int swappiness, struct zone *zone, unsigned long *nr_scanned) { @@ -2254,7 +2257,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swappiness = swappiness, .order = 0, .mem_cgroup = mem, }; @@ -2283,8 +2285,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap, - unsigned int swappiness) + bool noswap) { struct zonelist *zonelist; unsigned long nr_reclaimed; @@ -2294,7 +2295,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_unmap = 1, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .nodemask = NULL, /* we don't care the placement */ @@ -2445,7 +2445,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * we want to put equal scanning pressure on each zone. */ .nr_to_reclaim = ULONG_MAX, - .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, }; @@ -2915,7 +2914,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .may_writepage = 1, .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, - .swappiness = vm_swappiness, .order = 0, }; struct shrink_control shrink = { @@ -3102,7 +3100,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .nr_to_reclaim = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, .order = order, }; struct shrink_control shrink = { -- cgit v1.2.3 From bb2a0de92c891b8feeedc0178acb3ae009d899a8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:22 -0700 Subject: memcg: consolidate memory cgroup lru stat functions In mm/memcontrol.c, there are many lru stat functions as.. 
mem_cgroup_zone_nr_lru_pages mem_cgroup_node_nr_file_lru_pages mem_cgroup_nr_file_lru_pages mem_cgroup_node_nr_anon_lru_pages mem_cgroup_nr_anon_lru_pages mem_cgroup_node_nr_unevictable_lru_pages mem_cgroup_nr_unevictable_lru_pages mem_cgroup_node_nr_lru_pages mem_cgroup_nr_lru_pages mem_cgroup_get_local_zonestat Some of them are under #ifdef MAX_NUMNODES >1 and others are not. This seems bad. This patch consolidates all functions into mem_cgroup_zone_nr_lru_pages() mem_cgroup_node_nr_lru_pages() mem_cgroup_nr_lru_pages() For these functions, "which LRU?" information is passed by a mask. example: mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)) And I added some macro as ALL_LRU, ALL_LRU_FILE, ALL_LRU_ANON. example: mem_cgroup_nr_lru_pages(mem, ALL_LRU) BTW, considering layout of NUMA memory placement of counters, this patch seems to be better. Now, when we gather all LRU information, we scan in following orer for_each_lru -> for_each_node -> for_each_zone. This means we'll touch cache lines in different node in turn. After patch, we'll scan for_each_node -> for_each_zone -> for_each_lru(mask) Then, we'll gather information in the same cacheline at once. [akpm@linux-foundation.org: fix warnigns, build error] Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Michal Hocko Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +- include/linux/mmzone.h | 6 ++ mm/memcontrol.c | 176 +++++++++++++-------------------------------- mm/vmscan.c | 3 +- 4 files changed, 60 insertions(+), 132 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 50940da6adf3..affd5b19b86c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,8 +111,7 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru); + int nid, int zid, unsigned int lrumask); struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone); struct zone_reclaim_stat* @@ -313,8 +312,8 @@ mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) } static inline unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, struct zone *zone, - enum lru_list lru) +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, + unsigned int lru_mask) { return 0; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9f7c3ebcbbad..0a2d3d620feb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -158,6 +158,12 @@ static inline int is_unevictable_lru(enum lru_list l) return (l == LRU_UNEVICTABLE); } +/* Mask used at gathering information at once (see memcontrol.c) */ +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + enum zone_watermarks { WMARK_MIN, WMARK_LOW, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 506d116a7d33..85599662bd90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -636,27 +636,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, preempt_enable(); } -static unsigned long -mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list 
idx) +unsigned long +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, + unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; + enum lru_list l; + unsigned long ret = 0; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + + for_each_lru(l) { + if (BIT(l) & lru_mask) + ret += MEM_CGROUP_ZSTAT(mz, l); + } + return ret; +} + +static unsigned long +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, + int nid, unsigned int lru_mask) +{ u64 total = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } + for (zid = 0; zid < MAX_NR_ZONES; zid++) + total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); + return total; } -static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, - enum lru_list idx) + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, + unsigned int lru_mask) { int nid; u64 total = 0; - for_each_online_node(nid) - total += mem_cgroup_get_zonestat_node(mem, nid, idx); + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); return total; } @@ -1077,8 +1094,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1117,109 +1134,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) unsigned long active; unsigned long inactive; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); return (active > inactive); } -unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return MEM_CGROUP_ZSTAT(mz, lru); -} - -static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); - - return ret; -} - -static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); - return ret; -} - -#if MAX_NUMNODES > 1 -static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); - - return total; -} - -static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); - - return total; -} - -static unsigned long -mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) -{ - return mem_cgroup_get_zonestat_node(memcg, nid, 
LRU_UNEVICTABLE); -} - -static unsigned long -mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); - - return total; -} - -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - enum lru_list l; - u64 total = 0; - - for_each_lru(l) - total += mem_cgroup_get_zonestat_node(memcg, nid, l); - - return total; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid); - - return total; -} -#endif /* CONFIG_NUMA */ - struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -1576,11 +1496,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, int nid, bool noswap) { - if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) return true; return false; @@ -4151,15 +4071,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ - val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; } @@ -4181,35 +4101,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); - total_nr = mem_cgroup_nr_lru_pages(mem_cont); + total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); seq_printf(m, "total=%lu", total_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); + file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); seq_printf(m, "file=%lu", file_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + LRU_ALL_FILE); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); + anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); seq_printf(m, "anon=%lu", anon_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); + node_nr = 
mem_cgroup_node_nr_lru_pages(mem_cont, nid, + LRU_ALL_ANON); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); + unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); seq_printf(m, "unevictable=%lu", unevictable_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, - nid); + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + BIT(LRU_UNEVICTABLE)); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); diff --git a/mm/vmscan.c b/mm/vmscan.c index 05637491f244..91cee9dfc501 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -171,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); + return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, + zone_to_nid(zone), zone_idx(zone), BIT(lru)); return zone_page_state(zone, NR_LRU_BASE + lru); } -- cgit v1.2.3 From 79dfdaccd1d5b40ff7cf4a35a0e63696ebb78b4d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:23 -0700 Subject: memcg: make oom_lock 0 and 1 based rather than counter Commit 867578cb ("memcg: fix oom kill behavior") introduced a oom_lock counter which is incremented by mem_cgroup_oom_lock when we are about to handle memcg OOM situation. mem_cgroup_handle_oom falls back to a sleep if oom_lock > 1 to prevent from multiple oom kills at the same time. The counter is then decremented by mem_cgroup_oom_unlock called from the same function. This works correctly but it can lead to serious starvations when we have many processes triggering OOM and many CPUs available for them (I have tested with 16 CPUs). Consider a process (call it A) which gets the oom_lock (the first one that got to mem_cgroup_handle_oom and grabbed memcg_oom_mutex) and other processes that are blocked on the mutex. While A releases the mutex and calls mem_cgroup_out_of_memory others will wake up (one after another) and increase the counter and fall into sleep (memcg_oom_waitq). Once A finishes mem_cgroup_out_of_memory it takes the mutex again and decreases oom_lock and wakes other tasks (if releasing memory by somebody else - e.g. killed process - hasn't done it yet). A testcase would look like: Assume malloc XXX is a program allocating XXX Megabytes of memory which touches all allocated pages in a tight loop # swapoff SWAP_DEVICE # cgcreate -g memory:A # cgset -r memory.oom_control=0 A # cgset -r memory.limit_in_bytes= 200M # for i in `seq 100` # do # cgexec -g memory:A malloc 10 & # done The main problem here is that all processes still race for the mutex and there is no guarantee that we will get counter back to 0 for those that got back to mem_cgroup_handle_oom. In the end the whole convoy in/decreases the counter but we do not get to 1 that would enable killing so nothing useful can be done. The time is basically unbounded because it highly depends on scheduling and ordering on mutex (I have seen this taking hours...). This patch replaces the counter by a simple {un}lock semantic. As mem_cgroup_oom_{un}lock works on the a subtree of a hierarchy we have to make sure that nobody else races with us which is guaranteed by the memcg_oom_mutex. We have to be careful while locking subtrees because we can encounter a subtree which is already locked: hierarchy: A / \ B \ /\ \ C D E B - C - D tree might be already locked. 
While we want to enable locking E subtree because OOM situations cannot influence each other we definitely do not want to allow locking A. Therefore we have to refuse lock if any subtree is already locked and clear up the lock for all nodes that have been set up to the failure point. On the other hand we have to make sure that the rest of the world will recognize that a group is under OOM even though it doesn't have a lock. Therefore we have to introduce under_oom variable which is incremented and decremented for the whole subtree when we enter resp. leave mem_cgroup_handle_oom. under_oom, unlike oom_lock, doesn't need be updated under memcg_oom_mutex because its users only check a single group and they use atomic operations for that. This can be checked easily by the following test case: # cgcreate -g memory:A # cgset -r memory.use_hierarchy=1 A # cgset -r memory.oom_control=1 A # cgset -r memory.limit_in_bytes= 100M # cgset -r memory.memsw.limit_in_bytes= 100M # cgcreate -g memory:A/B # cgset -r memory.oom_control=1 A/B # cgset -r memory.limit_in_bytes=20M # cgset -r memory.memsw.limit_in_bytes=20M # cgexec -g memory:A/B malloc 30 & #->this will be blocked by OOM of group B # cgexec -g memory:A malloc 80 & #->this will be blocked by OOM of group A While B gets oom_lock A will not get it. Both of them go into sleep and wait for an external action. We can make the limit higher for A to enforce waking it up # cgset -r memory.memsw.limit_in_bytes=300M A # cgset -r memory.limit_in_bytes=300M A malloc in A has to wake up even though it doesn't have oom_lock. Finally, the unlock path is very easy because we always unlock only the subtree we have locked previously while we always decrement under_oom. Signed-off-by: Michal Hocko Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 85599662bd90..95d6c256b54c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -246,7 +246,10 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - atomic_t oom_lock; + + bool oom_lock; + atomic_t under_oom; + atomic_t refcnt; int swappiness; @@ -1722,37 +1725,83 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. + * Has to be called with memcg_oom_mutex */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int x, lock_count = 0; - struct mem_cgroup *iter; + int lock_count = -1; + struct mem_cgroup *iter, *failed = NULL; + bool cond = true; - for_each_mem_cgroup_tree(iter, mem) { - x = atomic_inc_return(&iter->oom_lock); - lock_count = max(x, lock_count); + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + bool locked = iter->oom_lock; + + iter->oom_lock = true; + if (lock_count == -1) + lock_count = iter->oom_lock; + else if (lock_count != locked) { + /* + * this subtree of our hierarchy is already locked + * so we cannot give a lock. 
+ */ + lock_count = 0; + failed = iter; + cond = false; + } } - if (lock_count == 1) - return true; - return false; + if (!failed) + goto done; + + /* + * OK, we failed to lock the whole subtree so we have to clean up + * what we set up to the failing subtree + */ + cond = true; + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + if (iter == failed) { + cond = false; + continue; + } + iter->oom_lock = false; + } +done: + return lock_count; } +/* + * Has to be called with memcg_oom_mutex + */ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { struct mem_cgroup *iter; + for_each_mem_cgroup_tree(iter, mem) + iter->oom_lock = false; + return 0; +} + +static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + atomic_inc(&iter->under_oom); +} + +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + /* * When a new child is created while the hierarchy is under oom, * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ for_each_mem_cgroup_tree(iter, mem) - atomic_add_unless(&iter->oom_lock, -1, 0); - return 0; + atomic_add_unless(&iter->under_oom, -1, 0); } - static DEFINE_MUTEX(memcg_oom_mutex); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1794,7 +1843,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) static void memcg_oom_recover(struct mem_cgroup *mem) { - if (mem && atomic_read(&mem->oom_lock)) + if (mem && atomic_read(&mem->under_oom)) memcg_wakeup_oom(mem); } @@ -1812,6 +1861,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); need_to_kill = true; + mem_cgroup_mark_under_oom(mem); + /* At first, try to OOM lock hierarchy under mem.*/ mutex_lock(&memcg_oom_mutex); locked = mem_cgroup_oom_lock(mem); @@ -1835,10 +1886,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) finish_wait(&memcg_oom_waitq, &owait.wait); } mutex_lock(&memcg_oom_mutex); - mem_cgroup_oom_unlock(mem); + if (locked) + mem_cgroup_oom_unlock(mem); memcg_wakeup_oom(mem); mutex_unlock(&memcg_oom_mutex); + mem_cgroup_unmark_under_oom(mem); + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; /* Give chance to dying process */ @@ -4505,7 +4559,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, list_add(&event->list, &memcg->oom_notify); /* already in OOM ? */ - if (atomic_read(&memcg->oom_lock)) + if (atomic_read(&memcg->under_oom)) eventfd_signal(eventfd, 1); mutex_unlock(&memcg_oom_mutex); @@ -4540,7 +4594,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); - if (atomic_read(&mem->oom_lock)) + if (atomic_read(&mem->under_oom)) cb->fill(cb, "under_oom", 1); else cb->fill(cb, "under_oom", 0); -- cgit v1.2.3 From 1af8efe965676ab30d6c8a5b1fccc9229f339a3b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:24 -0700 Subject: memcg: change memcg_oom_mutex to spinlock memcg_oom_mutex is used to protect memcg OOM path and eventfd interface for oom_control. None of the critical sections which it protects sleep (eventfd_signal works from atomic context and the rest are simple linked list resp. oom_lock atomic operations). Mutex is also too heavyweight for those code paths because it triggers a lot of scheduling. 
It also makes makes convoying effects more visible when we have a big number of oom killing because we take the lock mutliple times during mem_cgroup_handle_oom so we have multiple places where many processes can sleep. Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 95d6c256b54c..c0b065ec1571 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1725,7 +1725,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. - * Has to be called with memcg_oom_mutex + * Has to be called with memcg_oom_lock */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { @@ -1770,7 +1770,7 @@ done: } /* - * Has to be called with memcg_oom_mutex + * Has to be called with memcg_oom_lock */ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { @@ -1802,7 +1802,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) atomic_add_unless(&iter->under_oom, -1, 0); } -static DEFINE_MUTEX(memcg_oom_mutex); +static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -1864,7 +1864,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) mem_cgroup_mark_under_oom(mem); /* At first, try to OOM lock hierarchy under mem.*/ - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); locked = mem_cgroup_oom_lock(mem); /* * Even if signal_pending(), we can't quit charge() loop without @@ -1876,7 +1876,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) need_to_kill = false; if (locked) mem_cgroup_oom_notify(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); @@ -1885,11 +1885,11 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); if (locked) mem_cgroup_oom_unlock(mem); memcg_wakeup_oom(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); mem_cgroup_unmark_under_oom(mem); @@ -4553,7 +4553,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, if (!event) return -ENOMEM; - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); event->eventfd = eventfd; list_add(&event->list, &memcg->oom_notify); @@ -4561,7 +4561,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, /* already in OOM ? 
*/ if (atomic_read(&memcg->under_oom)) eventfd_signal(eventfd, 1); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); return 0; } @@ -4575,7 +4575,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, BUG_ON(type != _OOM_TYPE); - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { if (ev->eventfd == eventfd) { @@ -4584,7 +4584,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, } } - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); } static int mem_cgroup_oom_control_read(struct cgroup *cgrp, -- cgit v1.2.3 From 4508378b9523e22a2a0175d8bf64d932fb10a67d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:24 -0700 Subject: memcg: fix vmscan count in small memcgs Commit 246e87a93934 ("memcg: fix get_scan_count() for small targets") fixes the memcg/kswapd behavior against small targets and prevent vmscan priority too high. But the implementation is too naive and adds another problem to small memcg. It always force scan to 32 pages of file/anon and doesn't handle swappiness and other rotate_info. It makes vmscan to scan anon LRU regardless of swappiness and make reclaim bad. This patch fixes it by adjusting scanning count with regard to swappiness at el. At a test "cat 1G file under 300M limit." (swappiness=20) before patch scanned_pages_by_limit 360919 scanned_anon_pages_by_limit 180469 scanned_file_pages_by_limit 180450 rotated_pages_by_limit 31 rotated_anon_pages_by_limit 25 rotated_file_pages_by_limit 6 freed_pages_by_limit 180458 freed_anon_pages_by_limit 19 freed_file_pages_by_limit 180439 elapsed_ns_by_limit 429758872 after patch scanned_pages_by_limit 180674 scanned_anon_pages_by_limit 24 scanned_file_pages_by_limit 180650 rotated_pages_by_limit 35 rotated_anon_pages_by_limit 24 rotated_file_pages_by_limit 11 freed_pages_by_limit 180634 freed_anon_pages_by_limit 0 freed_file_pages_by_limit 180634 elapsed_ns_by_limit 367119089 scanned_pages_by_system 0 the numbers of scanning anon are decreased(as expected), and elapsed time reduced. By this patch, small memcgs will work better. (*) Because the amount of file-cache is much bigger than anon, recalaim_stat's rotate-scan counter make scanning files more. 
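
To make the new proportional split concrete, here is a minimal userspace C sketch of the calculation this patch adds to get_scan_count(): the forced SWAP_CLUSTER_MAX pages are divided between anon and file in the same ap/fp ratio that ordinary scanning would use, instead of forcing 32 pages on each list. The ap/fp inputs and the helper name are illustrative, not kernel code.

    #include <stdio.h>
    #include <stdint.h>

    #define SWAP_CLUSTER_MAX 32UL

    /*
     * Userspace model of the force-scan split: the forced amount is divided
     * between anon (index 0) and file (index 1) in the ap/fp proportion.
     * The ap/fp values stand in for the recent_scanned/recent_rotated based
     * weights computed in get_scan_count(); they are made up here.
     */
    static void force_scan_split(uint64_t ap, uint64_t fp,
                                 unsigned long nr_force_scan[2])
    {
        uint64_t denominator = ap + fp + 1;   /* +1 avoids division by zero */

        nr_force_scan[0] = (unsigned long)(SWAP_CLUSTER_MAX * ap / denominator);
        nr_force_scan[1] = (unsigned long)(SWAP_CLUSTER_MAX * fp / denominator);
    }

    int main(void)
    {
        unsigned long nr_force_scan[2];

        /* low-swappiness, file-heavy workload: file weight dominates */
        force_scan_split(2, 200, nr_force_scan);
        printf("forced anon=%lu file=%lu\n", nr_force_scan[0], nr_force_scan[1]);
        return 0;
    }

With weights like these almost the whole forced budget goes to the file LRU, which is exactly why the anon scan counts drop in the numbers above.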
Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 91cee9dfc501..f87702a376d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1795,6 +1795,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, enum lru_list l; int noswap = 0; int force_scan = 0; + unsigned long nr_force_scan[2]; anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + @@ -1817,6 +1818,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 0; fraction[1] = 1; denominator = 1; + nr_force_scan[0] = 0; + nr_force_scan[1] = SWAP_CLUSTER_MAX; goto out; } @@ -1828,6 +1831,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 1; fraction[1] = 0; denominator = 1; + nr_force_scan[0] = SWAP_CLUSTER_MAX; + nr_force_scan[1] = 0; goto out; } } @@ -1876,6 +1881,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; + if (force_scan) { + unsigned long scan = SWAP_CLUSTER_MAX; + nr_force_scan[0] = div64_u64(scan * ap, denominator); + nr_force_scan[1] = div64_u64(scan * fp, denominator); + } out: for_each_evictable_lru(l) { int file = is_file_lru(l); @@ -1896,12 +1906,8 @@ out: * memcg, priority drop can cause big latency. So, it's better * to scan small amount. See may_noscan above. */ - if (!scan && force_scan) { - if (file) - scan = SWAP_CLUSTER_MAX; - else if (!noswap) - scan = SWAP_CLUSTER_MAX; - } + if (!scan && force_scan) + scan = nr_force_scan[file]; nr[l] = scan; } } -- cgit v1.2.3 From 108b6a78463bb8c7163e4f9779f36ad8bbade334 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 26 Jul 2011 16:08:25 -0700 Subject: memcg: fix behavior of mem_cgroup_resize_limit() Commit 22a668d7c3ef ("memcg: fix behavior under memory.limit equals to memsw.limit") introduced "memsw_is_minimum" flag, which becomes true when mem_limit == memsw_limit. The flag is checked at the beginning of reclaim, and "noswap" is set if the flag is true, because using swap is meaningless in this case. This works well in most cases, but when we try to shrink mem_limit, which is the same as memsw_limit now, we might fail to shrink mem_limit because swap doesn't used. This patch fixes this behavior by: - check MEM_CGROUP_RECLAIM_SHRINK at the begining of reclaim - If it is set, don't set "noswap" flag even if memsw_is_minimum is true. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Ying Han Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0b065ec1571..dfeca594fd7a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1653,7 +1653,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. 
*/ - if (!check_soft && root_mem->memsw_is_minimum) + if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; while (1) { -- cgit v1.2.3 From 82f9d486e59f588c7d100865c36510644abda356 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:26 -0700 Subject: memcg: add memory.vmscan_stat The commit log of 0ae5e89c60c9 ("memcg: count the soft_limit reclaim in...") says it adds scanning stats to memory.stat file. But it doesn't because we considered we needed to make a concensus for such new APIs. This patch is a trial to add memory.scan_stat. This shows - the number of scanned pages(total, anon, file) - the number of rotated pages(total, anon, file) - the number of freed pages(total, anon, file) - the number of elaplsed time (including sleep/pause time) for both of direct/soft reclaim. The biggest difference with oringinal Ying's one is that this file can be reset by some write, as # echo 0 ...../memory.scan_stat Example of output is here. This is a result after make -j 6 kernel under 300M limit. [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.scan_stat [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.vmscan_stat scanned_pages_by_limit 9471864 scanned_anon_pages_by_limit 6640629 scanned_file_pages_by_limit 2831235 rotated_pages_by_limit 4243974 rotated_anon_pages_by_limit 3971968 rotated_file_pages_by_limit 272006 freed_pages_by_limit 2318492 freed_anon_pages_by_limit 962052 freed_file_pages_by_limit 1356440 elapsed_ns_by_limit 351386416101 scanned_pages_by_system 0 scanned_anon_pages_by_system 0 scanned_file_pages_by_system 0 rotated_pages_by_system 0 rotated_anon_pages_by_system 0 rotated_file_pages_by_system 0 freed_pages_by_system 0 freed_anon_pages_by_system 0 freed_file_pages_by_system 0 elapsed_ns_by_system 0 scanned_pages_by_limit_under_hierarchy 9471864 scanned_anon_pages_by_limit_under_hierarchy 6640629 scanned_file_pages_by_limit_under_hierarchy 2831235 rotated_pages_by_limit_under_hierarchy 4243974 rotated_anon_pages_by_limit_under_hierarchy 3971968 rotated_file_pages_by_limit_under_hierarchy 272006 freed_pages_by_limit_under_hierarchy 2318492 freed_anon_pages_by_limit_under_hierarchy 962052 freed_file_pages_by_limit_under_hierarchy 1356440 elapsed_ns_by_limit_under_hierarchy 351386416101 scanned_pages_by_system_under_hierarchy 0 scanned_anon_pages_by_system_under_hierarchy 0 scanned_file_pages_by_system_under_hierarchy 0 rotated_pages_by_system_under_hierarchy 0 rotated_anon_pages_by_system_under_hierarchy 0 rotated_file_pages_by_system_under_hierarchy 0 freed_pages_by_system_under_hierarchy 0 freed_anon_pages_by_system_under_hierarchy 0 freed_file_pages_by_system_under_hierarchy 0 elapsed_ns_by_system_under_hierarchy 0 total_xxxx is for hierarchy management. This will be useful for further memcg developments and need to be developped before we do some complicated rework on LRU/softlimit management. This patch adds a new struct memcg_scanrecord into scan_control struct. sc->nr_scanned at el is not designed for exporting information. For example, nr_scanned is reset frequentrly and incremented +2 at scanning mapped pages. To avoid complexity, I added a new param in scan_control which is for exporting scanning score. 
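
As a rough illustration of how one scan record feeds these counters, here is a self-contained userspace C sketch modelled on the __mem_cgroup_record_scanstat() accumulation in the patch; the struct is a stand-in for struct memcg_scanrecord, index 0 is anon and index 1 is file, and the sample values are made up for illustration.

    #include <stdio.h>

    enum { SCAN, SCAN_ANON, SCAN_FILE, ROTATE, ROTATE_ANON, ROTATE_FILE,
           FREED, FREED_ANON, FREED_FILE, ELAPSED, NR_SCANSTATS };

    struct scanrecord {
        unsigned long nr_scanned[2];   /* [0] anon, [1] file */
        unsigned long nr_rotated[2];
        unsigned long nr_freed[2];
        unsigned long elapsed;         /* nanoseconds, may include sleep */
    };

    /* fold one reclaim pass into the per-context counter array */
    static void record_scanstat(unsigned long stats[NR_SCANSTATS],
                                const struct scanrecord *rec)
    {
        stats[SCAN]        += rec->nr_scanned[0] + rec->nr_scanned[1];
        stats[SCAN_ANON]   += rec->nr_scanned[0];
        stats[SCAN_FILE]   += rec->nr_scanned[1];
        stats[ROTATE]      += rec->nr_rotated[0] + rec->nr_rotated[1];
        stats[ROTATE_ANON] += rec->nr_rotated[0];
        stats[ROTATE_FILE] += rec->nr_rotated[1];
        stats[FREED]       += rec->nr_freed[0] + rec->nr_freed[1];
        stats[FREED_ANON]  += rec->nr_freed[0];
        stats[FREED_FILE]  += rec->nr_freed[1];
        stats[ELAPSED]     += rec->elapsed;
    }

    int main(void)
    {
        unsigned long stats[NR_SCANSTATS] = { 0 };
        struct scanrecord rec = {
            .nr_scanned = { 24, 180650 },
            .nr_freed   = { 0, 180634 },
            .elapsed    = 367119089,
        };

        record_scanstat(stats, &rec);
        printf("scanned_pages %lu freed_file_pages %lu elapsed_ns %lu\n",
               stats[SCAN], stats[FREED_FILE], stats[ELAPSED]);
        return 0;
    }

In the kernel patch the same record is added twice, once to the scanned group's own counters and once to the hierarchy root's "rootstats", which is where the *_under_hierarchy lines come from.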
Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Cc: Andrew Bresticker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 85 ++++++++++++++++++- include/linux/memcontrol.h | 19 +++++ include/linux/swap.h | 6 -- mm/memcontrol.c | 172 +++++++++++++++++++++++++++++++++++++-- mm/vmscan.c | 39 +++++++-- 5 files changed, 303 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 06eb6d957c83..6f3c598971fc 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -380,7 +380,7 @@ will be charged as a new owner of it. 5.2 stat file -memory.stat file includes following statistics +5.2.1 memory.stat file includes following statistics # per-memory cgroup local status cache - # of bytes of page cache memory. @@ -438,6 +438,89 @@ Note: file_mapped is accounted only when the memory cgroup is owner of page cache.) +5.2.2 memory.vmscan_stat + +memory.vmscan_stat includes statistics information for memory scanning and +freeing, reclaiming. The statistics shows memory scanning information since +memory cgroup creation and can be reset to 0 by writing 0 as + + #echo 0 > ../memory.vmscan_stat + +This file contains following statistics. + +[param]_[file_or_anon]_pages_by_[reason]_[under_heararchy] +[param]_elapsed_ns_by_[reason]_[under_hierarchy] + +For example, + + scanned_file_pages_by_limit indicates the number of scanned + file pages at vmscan. + +Now, 3 parameters are supported + + scanned - the number of pages scanned by vmscan + rotated - the number of pages activated at vmscan + freed - the number of pages freed by vmscan + +If "rotated" is high against scanned/freed, the memcg seems busy. + +Now, 2 reason are supported + + limit - the memory cgroup's limit + system - global memory pressure + softlimit + (global memory pressure not under softlimit is not handled now) + +When under_hierarchy is added in the tail, the number indicates the +total memcg scan of its children and itself. + +elapsed_ns is a elapsed time in nanosecond. This may include sleep time +and not indicates CPU usage. So, please take this as just showing +latency. + +Here is an example. 
+ +# cat /cgroup/memory/A/memory.vmscan_stat +scanned_pages_by_limit 9471864 +scanned_anon_pages_by_limit 6640629 +scanned_file_pages_by_limit 2831235 +rotated_pages_by_limit 4243974 +rotated_anon_pages_by_limit 3971968 +rotated_file_pages_by_limit 272006 +freed_pages_by_limit 2318492 +freed_anon_pages_by_limit 962052 +freed_file_pages_by_limit 1356440 +elapsed_ns_by_limit 351386416101 +scanned_pages_by_system 0 +scanned_anon_pages_by_system 0 +scanned_file_pages_by_system 0 +rotated_pages_by_system 0 +rotated_anon_pages_by_system 0 +rotated_file_pages_by_system 0 +freed_pages_by_system 0 +freed_anon_pages_by_system 0 +freed_file_pages_by_system 0 +elapsed_ns_by_system 0 +scanned_pages_by_limit_under_hierarchy 9471864 +scanned_anon_pages_by_limit_under_hierarchy 6640629 +scanned_file_pages_by_limit_under_hierarchy 2831235 +rotated_pages_by_limit_under_hierarchy 4243974 +rotated_anon_pages_by_limit_under_hierarchy 3971968 +rotated_file_pages_by_limit_under_hierarchy 272006 +freed_pages_by_limit_under_hierarchy 2318492 +freed_anon_pages_by_limit_under_hierarchy 962052 +freed_file_pages_by_limit_under_hierarchy 1356440 +elapsed_ns_by_limit_under_hierarchy 351386416101 +scanned_pages_by_system_under_hierarchy 0 +scanned_anon_pages_by_system_under_hierarchy 0 +scanned_file_pages_by_system_under_hierarchy 0 +rotated_pages_by_system_under_hierarchy 0 +rotated_anon_pages_by_system_under_hierarchy 0 +rotated_file_pages_by_system_under_hierarchy 0 +freed_pages_by_system_under_hierarchy 0 +freed_anon_pages_by_system_under_hierarchy 0 +freed_file_pages_by_system_under_hierarchy 0 +elapsed_ns_by_system_under_hierarchy 0 + 5.3 swappiness Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index affd5b19b86c..b96600786913 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -39,6 +39,16 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct mem_cgroup *mem_cont, int active, int file); +struct memcg_scanrecord { + struct mem_cgroup *mem; /* scanend memory cgroup */ + struct mem_cgroup *root; /* scan target hierarchy root */ + int context; /* scanning context (see memcontrol.c) */ + unsigned long nr_scanned[2]; /* the number of scanned pages */ + unsigned long nr_rotated[2]; /* the number of rotated pages */ + unsigned long nr_freed[2]; /* the number of freed pages */ + unsigned long elapsed; /* nsec of time elapsed while scanning */ +}; + #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -119,6 +129,15 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + struct memcg_scanrecord *rec); +extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + struct zone *zone, + struct memcg_scanrecord *rec, + unsigned long *nr_scanned); + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 44558b600ee3..91d5fcc83116 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -251,12 +251,6 @@ static inline void lru_cache_add_file(struct page *page) /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); -extern 
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap); -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - unsigned long *nr_scanned); extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dfeca594fd7a..04e505bfd7dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +enum { + SCAN_BY_LIMIT, + SCAN_BY_SYSTEM, + NR_SCAN_CONTEXT, + SCAN_BY_SHRINK, /* not recorded now */ +}; + +enum { + SCAN, + SCAN_ANON, + SCAN_FILE, + ROTATE, + ROTATE_ANON, + ROTATE_FILE, + FREED, + FREED_ANON, + FREED_FILE, + ELAPSED, + NR_SCANSTATS, +}; + +struct scanstat { + spinlock_t lock; + unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; + unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; +}; + +const char *scanstat_string[NR_SCANSTATS] = { + "scanned_pages", + "scanned_anon_pages", + "scanned_file_pages", + "rotated_pages", + "rotated_anon_pages", + "rotated_file_pages", + "freed_pages", + "freed_anon_pages", + "freed_file_pages", + "elapsed_ns", +}; +#define SCANSTAT_WORD_LIMIT "_by_limit" +#define SCANSTAT_WORD_SYSTEM "_by_system" +#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" + + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -270,7 +314,8 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - + /* For recording LRU-scan statistics */ + struct scanstat scanstat; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -1623,6 +1668,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) } #endif +static void __mem_cgroup_record_scanstat(unsigned long *stats, + struct memcg_scanrecord *rec) +{ + + stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; + stats[SCAN_ANON] += rec->nr_scanned[0]; + stats[SCAN_FILE] += rec->nr_scanned[1]; + + stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; + stats[ROTATE_ANON] += rec->nr_rotated[0]; + stats[ROTATE_FILE] += rec->nr_rotated[1]; + + stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; + stats[FREED_ANON] += rec->nr_freed[0]; + stats[FREED_FILE] += rec->nr_freed[1]; + + stats[ELAPSED] += rec->elapsed; +} + +static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) +{ + struct mem_cgroup *mem; + int context = rec->context; + + if (context >= NR_SCAN_CONTEXT) + return; + + mem = rec->mem; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); + spin_unlock(&mem->scanstat.lock); + + mem = rec->root; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); + spin_unlock(&mem->scanstat.lock); +} + /* * Scan the hierarchy if needed to reclaim memory. 
We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1647,8 +1730,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + struct memcg_scanrecord rec; unsigned long excess; - unsigned long nr_scanned; + unsigned long scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1656,6 +1740,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; + if (shrink) + rec.context = SCAN_BY_SHRINK; + else if (check_soft) + rec.context = SCAN_BY_SYSTEM; + else + rec.context = SCAN_BY_LIMIT; + + rec.root = root_mem; + while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { @@ -1696,14 +1789,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } + rec.mem = victim; + rec.nr_scanned[0] = 0; + rec.nr_scanned[1] = 0; + rec.nr_rotated[0] = 0; + rec.nr_rotated[1] = 0; + rec.nr_freed[0] = 0; + rec.nr_freed[1] = 0; + rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; + noswap, zone, &rec, &scanned); + *total_scanned += scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); + noswap, &rec); + mem_cgroup_record_scanstat(&rec); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3792,14 +3894,18 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { + struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } + rec.context = SCAN_BY_SHRINK; + rec.mem = mem; + rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false); + false, &rec); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4643,6 +4749,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, + struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + char string[64]; + int i; + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); + } + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); + } + return 0; +} + +static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, + unsigned int event) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + spin_lock(&mem->scanstat.lock); + 
memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); + memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); + spin_unlock(&mem->scanstat.lock); + return 0; +} + + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4713,6 +4867,11 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif + { + .name = "vmscan_stat", + .read_map = mem_cgroup_vmscan_stat_read, + .trigger = mem_cgroup_reset_vmscan_stat, + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4976,6 +5135,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); + spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); diff --git a/mm/vmscan.c b/mm/vmscan.c index f87702a376d0..7ef69124fa3e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -105,6 +105,7 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -1348,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1391,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; + if (!scanning_global_lru(sc)) { + sc->memcg_record->nr_scanned[0] += *nr_anon; + sc->memcg_record->nr_scanned[1] += *nr_file; + } } /* @@ -1504,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_reclaimed += shrink_page_list(&page_list, zone, sc); } + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_freed[file] += nr_reclaimed; + local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1603,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1654,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. 
*/ reclaim_stat->recent_rotated[file] += nr_rotated; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -2254,9 +2268,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - unsigned long *nr_scanned) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + struct memcg_scanrecord *rec, + unsigned long *scanned) { struct scan_control sc = { .nr_scanned = 0, @@ -2266,7 +2281,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_swap = !noswap, .order = 0, .mem_cgroup = mem, + .memcg_record = rec, }; + unsigned long start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2275,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); + start = sched_clock(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2283,19 +2301,25 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); + end = sched_clock(); + + if (rec) + rec->elapsed += end - start; + *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); - *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap) + bool noswap, + struct memcg_scanrecord *rec) { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2304,6 +2328,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, .mem_cgroup = mem_cont, + .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2312,6 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; + start = sched_clock(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2326,6 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + end = sched_clock(); + if (rec) + rec->elapsed += end - start; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); -- cgit v1.2.3 From d1a05b6973c7cb33144fa965d73facc708ffc37d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:27 -0700 Subject: memcg: do not try to drain per-cpu caches without pages drain_all_stock_async tries to optimize a work to be done on the work queue by excluding any work for the current CPU because it assumes that the context we are called from already tried to charge from that cache and it's failed so it must be empty already. While the assumption is correct we can optimize it even more by checking the current number of pages in the cache. This will also reduce a work on other CPUs with an empty stock. For the current CPU we can simply call drain_local_stock rather than deferring it to the work queue. 
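
A minimal userspace sketch of the per-CPU decision this patch introduces, assuming made-up stand-ins for memcg_stock_pcp, drain_local_stock() and schedule_work_on(): a stock with no cached pages is skipped entirely, and the current CPU is drained inline rather than through the work queue.

    #include <stdio.h>

    #define NR_CPUS 4

    struct stock {
        unsigned int nr_pages;   /* pages cached for some memcg */
    };

    static void drain_locally(int cpu)    { printf("cpu %d: drain inline\n", cpu); }
    static void queue_drain_work(int cpu) { printf("cpu %d: queue work\n", cpu); }

    static void drain_all_stock(struct stock stocks[], int curcpu)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            if (!stocks[cpu].nr_pages)
                continue;            /* nothing cached, no work needed */
            if (cpu == curcpu)
                drain_locally(cpu);  /* no point deferring our own cache */
            else
                queue_drain_work(cpu);
        }
    }

    int main(void)
    {
        struct stock stocks[NR_CPUS] = { {32}, {0}, {7}, {0} };

        drain_all_stock(stocks, 0);  /* pretend we run on CPU 0 */
        return 0;
    }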
[kamezawa.hiroyu@jp.fujitsu.com: use drain_local_stock for current CPU optimization] Signed-off-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04e505bfd7dd..2f5534e1968c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2180,11 +2180,8 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem) struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *mem; - if (cpu == curcpu) - continue; - mem = stock->cached; - if (!mem) + if (!mem || !stock->nr_pages) continue; if (mem != root_mem) { if (!root_mem->use_hierarchy) @@ -2193,8 +2190,12 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem) if (!css_is_ancestor(&mem->css, &root_mem->css)) continue; } - if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) - schedule_work_on(cpu, &stock->work); + if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (cpu == curcpu) + drain_local_stock(&stock->work); + else + schedule_work_on(cpu, &stock->work); + } } put_online_cpus(); mutex_unlock(&percpu_charge_mutex); -- cgit v1.2.3 From d38144b7a5f8d0a5e05d549177191374c6911009 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:28 -0700 Subject: memcg: unify sync and async per-cpu charge cache draining Currently we have two ways how to drain per-CPU caches for charges. drain_all_stock_sync will synchronously drain all caches while drain_all_stock_async will asynchronously drain only those that refer to a given memory cgroup or its subtree in hierarchy. Targeted async draining has been introduced by 26fe6168 (memcg: fix percpu cached charge draining frequency) to reduce the cpu workers number. sync draining is currently triggered only from mem_cgroup_force_empty which is triggered only by userspace (mem_cgroup_force_empty_write) or when a cgroup is removed (mem_cgroup_pre_destroy). Although these are not usually frequent operations it still makes some sense to do targeted draining as well, especially if the box has many CPUs. This patch unifies both methods to use the single code (drain_all_stock) which relies on the original async implementation and just adds flush_work to wait on all caches that are still under work for the sync mode. We are using FLUSHING_CACHED_CHARGE bit check to prevent from waiting on a work that we haven't triggered. Please note that both sync and async functions are currently protected by percpu_charge_mutex so we cannot race with other drainers. Signed-off-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f5534e1968c..af920d0f9025 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2154,19 +2154,14 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) } /* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. + * Drains all per-CPU charge caches for given root_mem resp. subtree + * of the hierarchy under it. sync flag says whether we should block + * until the work is done. 
*/ -static void drain_all_stock_async(struct mem_cgroup *root_mem) +static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) { int cpu, curcpu; - /* - * If someone calls draining, avoid adding more kworker runs. - */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; + /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); /* @@ -2197,17 +2192,42 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem) schedule_work_on(cpu, &stock->work); } } + + if (!sync) + goto out; + + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + flush_work(&stock->work); + } +out: put_online_cpus(); +} + +/* + * Tries to drain stocked charges in other cpus. This function is asynchronous + * and just put a work per cpu for draining localy on each cpu. Caller can + * expects some charges will be back to res_counter later but cannot wait for + * it. + */ +static void drain_all_stock_async(struct mem_cgroup *root_mem) +{ + /* + * If someone calls draining, avoid adding more kworker runs. + */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; + drain_all_stock(root_mem, false); mutex_unlock(&percpu_charge_mutex); - /* We don't wait for flush_work */ } /* This is a synchronous drain interface. */ -static void drain_all_stock_sync(void) +static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ mutex_lock(&percpu_charge_mutex); - schedule_on_each_cpu(drain_local_stock); + drain_all_stock(root_mem, true); mutex_unlock(&percpu_charge_mutex); } @@ -3856,7 +3876,7 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); - drain_all_stock_sync(); + drain_all_stock_sync(mem); ret = 0; mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { -- cgit v1.2.3 From 3e92041d68b40c47faa34c7dc08fc650a6c36adc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:29 -0700 Subject: memcg: add mem_cgroup_same_or_subtree() helper We are checking whether a given two groups are same or at least in the same subtree of a hierarchy at several places. Let's make a helper for it to make code easier to read. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index af920d0f9025..79f23a189941 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1108,6 +1108,21 @@ void mem_cgroup_move_lists(struct page *page, mem_cgroup_add_lru_list(page, to); } +/* + * Checks whether given mem is same or in the root_mem's + * hierarchy subtree + */ +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, + struct mem_cgroup *mem) +{ + if (root_mem != mem) { + return (root_mem->use_hierarchy && + css_is_ancestor(&mem->css, &root_mem->css)); + } + + return true; +} + int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; @@ -1127,10 +1142,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) * enabled in "curr" and "curr" is a child of "mem" in *cgroup* * hierarchy(even if use_hierarchy is disabled in "mem"). 
*/ - if (mem->use_hierarchy) - ret = css_is_ancestor(&curr->css, &mem->css); - else - ret = (curr == mem); + ret = mem_cgroup_same_or_subtree(mem, curr); css_put(&curr->css); return ret; } @@ -1369,10 +1381,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) to = mc.to; if (!from) goto unlock; - if (from == mem || to == mem - || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) - || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) - ret = true; + + ret = mem_cgroup_same_or_subtree(mem, from) + || mem_cgroup_same_or_subtree(mem, to); unlock: spin_unlock(&mc.lock); return ret; @@ -1915,25 +1926,20 @@ struct oom_wait_info { static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, + *oom_wait_mem; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); + oom_wait_mem = oom_wait_info->mem; - if (oom_wait_info->mem == wake_mem) - goto wakeup; - /* if no hierarchy, no match */ - if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) - return 0; /* * Both of oom_wait_info->mem and wake_mem are stable under us. * Then we can use css_is_ancestor without taking care of RCU. */ - if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && - !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) + && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) return 0; - -wakeup: return autoremove_wake_function(wait, mode, sync, arg); } @@ -2178,13 +2184,8 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) mem = stock->cached; if (!mem || !stock->nr_pages) continue; - if (mem != root_mem) { - if (!root_mem->use_hierarchy) - continue; - /* check whether "mem" is under tree of "root_mem" */ - if (!css_is_ancestor(&mem->css, &root_mem->css)) - continue; - } + if (!mem_cgroup_same_or_subtree(root_mem, mem)) + continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); -- cgit v1.2.3 From 8521fc50d433507a7cdc96bec280f9e5888a54cc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:29 -0700 Subject: memcg: get rid of percpu_charge_mutex lock percpu_charge_mutex protects from multiple simultaneous per-cpu charge caches draining because we might end up having too many work items. At least this was the case until commit 26fe61684449 ("memcg: fix percpu cached charge draining frequency") when we introduced a more targeted draining for async mode. Now that also sync draining is targeted we can safely remove mutex because we will not send more work than the current number of CPUs. FLUSHING_CACHED_CHARGE protects from sending the same work multiple times and stock->nr_pages == 0 protects from pointless sending a work if there is obviously nothing to be done. This is of course racy but we can live with it as the race window is really small (we would have to see FLUSHING_CACHED_CHARGE cleared while nr_pages would be still non-zero). The only remaining place where we can race is synchronous mode when we rely on FLUSHING_CACHED_CHARGE test which might have been set by other drainer on the same group but we should wait in that case as well. 
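
The following is a small C11 sketch, under the assumption that a per-stock atomic test-and-set flag is what prevents duplicate drain work once the mutex is gone; the names are illustrative and do not match the kernel API exactly.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct stock {
        atomic_flag flushing;    /* models FLUSHING_CACHED_CHARGE */
        unsigned int nr_pages;
    };

    static bool try_queue_drain(struct stock *stock)
    {
        if (!stock->nr_pages)
            return false;                             /* obviously nothing to do */
        if (atomic_flag_test_and_set(&stock->flushing))
            return false;                             /* a drain is already pending */
        printf("drain queued\n");                     /* stand-in for queuing the work */
        return true;
    }

    int main(void)
    {
        struct stock stock = { .nr_pages = 16 };

        atomic_flag_clear(&stock.flushing);  /* start in the "not flushing" state */
        try_queue_drain(&stock);             /* queues the work */
        try_queue_drain(&stock);             /* no-op: flag already set */
        return 0;
    }

The remaining race described above (flag observed clear while nr_pages is still non-zero) only costs a redundant queue-or-wait, never a missed drain, which is why it is acceptable to drop the mutex.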
Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79f23a189941..5f84d2351ddb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2092,7 +2092,6 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); -static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -2199,7 +2198,8 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && + test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) flush_work(&stock->work); } out: @@ -2214,22 +2214,14 @@ out: */ static void drain_all_stock_async(struct mem_cgroup *root_mem) { - /* - * If someone calls draining, avoid adding more kworker runs. - */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; drain_all_stock(root_mem, false); - mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. */ static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ - mutex_lock(&percpu_charge_mutex); drain_all_stock(root_mem, true); - mutex_unlock(&percpu_charge_mutex); } /* -- cgit v1.2.3 From 778d3b0ff0654ad7092bf823fd32010066b12365 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:30 -0700 Subject: cpusets: randomize node rotor used in cpuset_mem_spread_node() [ This patch has already been accepted as commit 0ac0c0d0f837 but later reverted (commit 35926ff5fba8) because it itroduced arch specific __node_random which was defined only for x86 code so it broke other archs. This is a followup without any arch specific code. Other than that there are no functional changes.] Some workloads that create a large number of small files tend to assign too many pages to node 0 (multi-node systems). Part of the reason is that the rotor (in cpuset_mem_spread_node()) used to assign nodes starts at node 0 for newly created tasks. This patch changes the rotor to be initialized to a random node number of the cpuset. 
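
A minimal userspace C sketch of the node_random() idea added further down in this patch, with a 64-bit word standing in for nodemask_t: pick a random ordinal below the mask weight and return the position of that set bit, so newly forked tasks start their spread rotor on a random allowed node instead of all piling onto node 0.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* return the position of a random set bit, or -1 for an empty mask */
    static int node_random(uint64_t mask)
    {
        int weight = __builtin_popcountll(mask);

        if (!weight)
            return -1;

        int target = rand() % weight;   /* which set bit we want, by order */
        for (int bit = 0; bit < 64; bit++) {
            if (!(mask & (1ULL << bit)))
                continue;
            if (target-- == 0)
                return bit;
        }
        return -1;   /* not reached */
    }

    int main(void)
    {
        uint64_t mems_allowed = 0xB;   /* nodes 0, 1 and 3 allowed */

        printf("start rotor at node %d\n", node_random(mems_allowed));
        return 0;
    }

The kernel version does the same ordinal-to-position walk via bitmap_ord_to_pos() and get_random_int(), as the diff below shows.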
[akpm@linux-foundation.org: fix layout] [Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration] [mhocko@suse.cz: Make it arch independent] [akpm@linux-foundation.org: fix CONFIG_NUMA=y, MAX_NUMNODES>1 build] Signed-off-by: Jack Steiner Signed-off-by: Lee Schermerhorn Signed-off-by: Michal Hocko Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Pekka Enberg Cc: Paul Menage Cc: Jack Steiner Cc: Robin Holt Cc: David Rientjes Cc: Christoph Lameter Cc: David Rientjes Cc: Jack Steiner Cc: KOSAKI Motohiro Cc: Lee Schermerhorn Cc: Michal Hocko Cc: Paul Menage Cc: Pekka Enberg Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 1 + include/linux/nodemask.h | 13 +++++++++++++ kernel/cpuset.c | 8 ++++++++ kernel/fork.c | 4 ++++ lib/bitmap.c | 2 +- mm/mempolicy.c | 16 ++++++++++++++++ 6 files changed, 43 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index dcafe0bf0005..3bac44cce142 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -144,6 +144,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); +extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); #define BITMAP_LAST_WORD_MASK(nbits) \ ( \ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index dba35e413371..7afc36334d52 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -66,6 +66,8 @@ * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * + * int node_random(mask) Random node with set bit in mask + * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? 
* @@ -430,6 +432,7 @@ static inline void node_set_offline(int nid) node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } + #else static inline int node_state(int node, enum node_states state) @@ -460,6 +463,16 @@ static inline int num_node_state(enum node_states state) #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) + +#endif + +#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) +extern int node_random(const nodemask_t *maskp); +#else +static inline int node_random(const nodemask_t *mask) +{ + return 0; +} #endif #define node_online_map node_states[N_ONLINE] diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9c9b7545c810..f8bc977ccbbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor) int cpuset_mem_spread_node(void) { + if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) + current->cpuset_mem_spread_rotor = + node_random(¤t->mems_allowed); + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } int cpuset_slab_spread_node(void) { + if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) + current->cpuset_slab_spread_rotor = + node_random(¤t->mems_allowed); + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); } diff --git a/kernel/fork.c b/kernel/fork.c index 17bf7c8d6511..e33177edb3bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1173,6 +1173,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_CPUSETS + p->cpuset_mem_spread_rotor = NUMA_NO_NODE; + p->cpuset_slab_spread_rotor = NUMA_NO_NODE; +#endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW diff --git a/lib/bitmap.c b/lib/bitmap.c index 3f3b68199d74..37ef4b048795 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -756,7 +756,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) * * The bit positions 0 through @bits are valid positions in @buf. */ -static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) +int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) { int pos = 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d25c54e..8b57173c1dd5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,6 +93,7 @@ #include #include +#include #include "internal.h" @@ -1645,6 +1646,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } +/* + * Return the bit number of a random bit set in the nodemask. + * (returns -1 if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = -1; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} + #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) -- cgit v1.2.3 From 7f5ddcc8d3eaccd5e169fda738530f937509645e Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Jul 2011 16:09:02 -0700 Subject: fault-injection: use debugfs_remove_recursive Use debugfs_remove_recursive() to simplify initialization and deinitialization of fault injection debugfs files. 
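
As a rough sketch of the error-handling shape this change allows, the userspace C fragment below uses hypothetical create_file() and remove_dir_recursive() helpers in place of debugfs_create_*() and debugfs_remove_recursive(): because one recursive removal cleans up whatever was created, the init path no longer needs to remember each file it made.

    #include <stdbool.h>
    #include <stdio.h>

    static bool create_file(const char *dir, const char *name)
    {
        printf("create %s/%s\n", dir, name);
        return true;   /* pretend creation succeeded */
    }

    static void remove_dir_recursive(const char *dir)
    {
        printf("recursively remove %s\n", dir);
    }

    static int init_fault_attr(const char *dir)
    {
        static const char *files[] = { "probability", "interval", "times" };

        for (unsigned int i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
            if (!create_file(dir, files[i]))
                goto fail;   /* one call below cleans up everything created so far */
        }
        return 0;
    fail:
        remove_dir_recursive(dir);
        return -1;
    }

    int main(void)
    {
        return init_fault_attr("failslab");
    }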
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fault-inject.h | 18 +------ lib/fault-inject.c | 115 ++++++++++--------------------------------- mm/failslab.c | 2 +- mm/page_alloc.c | 2 +- 4 files changed, 30 insertions(+), 107 deletions(-) (limited to 'mm') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 7b72328cc8fe..a842db638380 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -27,23 +27,7 @@ struct fault_attr { unsigned long count; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - - struct { - struct dentry *dir; - - struct dentry *probability_file; - struct dentry *interval_file; - struct dentry *times_file; - struct dentry *space_file; - struct dentry *verbose_file; - struct dentry *task_filter_file; - struct dentry *stacktrace_depth_file; - struct dentry *require_start_file; - struct dentry *require_end_file; - struct dentry *reject_start_file; - struct dentry *reject_end_file; - } dentries; - + struct dentry *dir; #endif }; diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 882fd3b6a6ad..2577b121c7c1 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -199,48 +199,7 @@ static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode, void cleanup_fault_attr_dentries(struct fault_attr *attr) { - debugfs_remove(attr->dentries.probability_file); - attr->dentries.probability_file = NULL; - - debugfs_remove(attr->dentries.interval_file); - attr->dentries.interval_file = NULL; - - debugfs_remove(attr->dentries.times_file); - attr->dentries.times_file = NULL; - - debugfs_remove(attr->dentries.space_file); - attr->dentries.space_file = NULL; - - debugfs_remove(attr->dentries.verbose_file); - attr->dentries.verbose_file = NULL; - - debugfs_remove(attr->dentries.task_filter_file); - attr->dentries.task_filter_file = NULL; - -#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER - - debugfs_remove(attr->dentries.stacktrace_depth_file); - attr->dentries.stacktrace_depth_file = NULL; - - debugfs_remove(attr->dentries.require_start_file); - attr->dentries.require_start_file = NULL; - - debugfs_remove(attr->dentries.require_end_file); - attr->dentries.require_end_file = NULL; - - debugfs_remove(attr->dentries.reject_start_file); - attr->dentries.reject_start_file = NULL; - - debugfs_remove(attr->dentries.reject_end_file); - attr->dentries.reject_end_file = NULL; - -#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ - - if (attr->dentries.dir) - WARN_ON(!simple_empty(attr->dentries.dir)); - - debugfs_remove(attr->dentries.dir); - attr->dentries.dir = NULL; + debugfs_remove_recursive(attr->dir); } int init_fault_attr_dentries(struct fault_attr *attr, const char *name) @@ -248,66 +207,46 @@ int init_fault_attr_dentries(struct fault_attr *attr, const char *name) mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; - memset(&attr->dentries, 0, sizeof(attr->dentries)); - dir = debugfs_create_dir(name, NULL); if (!dir) - goto fail; - attr->dentries.dir = dir; - - attr->dentries.probability_file = - debugfs_create_ul("probability", mode, dir, &attr->probability); + return -ENOMEM; - attr->dentries.interval_file = - debugfs_create_ul("interval", mode, dir, &attr->interval); + attr->dir = dir; - attr->dentries.times_file = - debugfs_create_atomic_t("times", mode, dir, &attr->times); - - attr->dentries.space_file = - debugfs_create_atomic_t("space", mode, dir, &attr->space); - - attr->dentries.verbose_file = - debugfs_create_ul("verbose", mode, dir, &attr->verbose); 
- - attr->dentries.task_filter_file = debugfs_create_bool("task-filter", - mode, dir, &attr->task_filter); - - if (!attr->dentries.probability_file || !attr->dentries.interval_file || - !attr->dentries.times_file || !attr->dentries.space_file || - !attr->dentries.verbose_file || !attr->dentries.task_filter_file) + if (!debugfs_create_ul("probability", mode, dir, &attr->probability)) + goto fail; + if (!debugfs_create_ul("interval", mode, dir, &attr->interval)) + goto fail; + if (!debugfs_create_atomic_t("times", mode, dir, &attr->times)) + goto fail; + if (!debugfs_create_atomic_t("space", mode, dir, &attr->space)) + goto fail; + if (!debugfs_create_ul("verbose", mode, dir, &attr->verbose)) + goto fail; + if (!debugfs_create_bool("task-filter", mode, dir, &attr->task_filter)) goto fail; #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER - attr->dentries.stacktrace_depth_file = - debugfs_create_stacktrace_depth( - "stacktrace-depth", mode, dir, &attr->stacktrace_depth); - - attr->dentries.require_start_file = - debugfs_create_ul("require-start", mode, dir, &attr->require_start); - - attr->dentries.require_end_file = - debugfs_create_ul("require-end", mode, dir, &attr->require_end); - - attr->dentries.reject_start_file = - debugfs_create_ul("reject-start", mode, dir, &attr->reject_start); - - attr->dentries.reject_end_file = - debugfs_create_ul("reject-end", mode, dir, &attr->reject_end); - - if (!attr->dentries.stacktrace_depth_file || - !attr->dentries.require_start_file || - !attr->dentries.require_end_file || - !attr->dentries.reject_start_file || - !attr->dentries.reject_end_file) + if (!debugfs_create_stacktrace_depth("stacktrace-depth", mode, dir, + &attr->stacktrace_depth)) + goto fail; + if (!debugfs_create_ul("require-start", mode, dir, + &attr->require_start)) + goto fail; + if (!debugfs_create_ul("require-end", mode, dir, &attr->require_end)) + goto fail; + if (!debugfs_create_ul("reject-start", mode, dir, &attr->reject_start)) + goto fail; + if (!debugfs_create_ul("reject-end", mode, dir, &attr->reject_end)) goto fail; #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ return 0; fail: - cleanup_fault_attr_dentries(attr); + debugfs_remove_recursive(attr->dir); + return -ENOMEM; } diff --git a/mm/failslab.c b/mm/failslab.c index c5f88f240ddc..7df9f7f0abf1 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -45,7 +45,7 @@ static int __init failslab_debugfs_init(void) err = init_fault_attr_dentries(&failslab.attr, "failslab"); if (err) return err; - dir = failslab.attr.dentries.dir; + dir = failslab.attr.dir; failslab.ignore_gfp_wait_file = debugfs_create_bool("ignore-gfp-wait", mode, dir, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 094472377d81..72c6820a345c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1424,7 +1424,7 @@ static int __init fail_page_alloc_debugfs(void) "fail_page_alloc"); if (err) return err; - dir = fail_page_alloc.attr.dentries.dir; + dir = fail_page_alloc.attr.dir; fail_page_alloc.ignore_gfp_wait_file = debugfs_create_bool("ignore-gfp-wait", mode, dir, -- cgit v1.2.3 From 810f09b87b75d7cc3906ffffe4311003f37caa2a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Jul 2011 16:09:02 -0700 Subject: failslab: simplify debugfs initialization Now cleanup_fault_attr_dentries() recursively removes a directory, So we can simplify the error handling in the initialization code and no need to hold dentry structs for each debugfs file. 
Signed-off-by: Akinobu Mita Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/failslab.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/failslab.c b/mm/failslab.c index 7df9f7f0abf1..1ce58c201dca 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -5,10 +5,6 @@ static struct { struct fault_attr attr; u32 ignore_gfp_wait; int cache_filter; -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - struct dentry *ignore_gfp_wait_file; - struct dentry *cache_filter_file; -#endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, @@ -39,31 +35,24 @@ __setup("failslab=", setup_failslab); static int __init failslab_debugfs_init(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - struct dentry *dir; int err; err = init_fault_attr_dentries(&failslab.attr, "failslab"); if (err) return err; - dir = failslab.attr.dir; - - failslab.ignore_gfp_wait_file = - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait); - failslab.cache_filter_file = - debugfs_create_bool("cache-filter", mode, dir, - &failslab.cache_filter); + if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir, + &failslab.ignore_gfp_wait)) + goto fail; + if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir, + &failslab.cache_filter)) + goto fail; - if (!failslab.ignore_gfp_wait_file || - !failslab.cache_filter_file) { - err = -ENOMEM; - debugfs_remove(failslab.cache_filter_file); - debugfs_remove(failslab.ignore_gfp_wait_file); - cleanup_fault_attr_dentries(&failslab.attr); - } + return 0; +fail: + cleanup_fault_attr_dentries(&failslab.attr); - return err; + return -ENOMEM; } late_initcall(failslab_debugfs_init); -- cgit v1.2.3 From b2588c4b4c3c075e9b45d61065d86c60de2b6441 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Jul 2011 16:09:03 -0700 Subject: fail_page_alloc: simplify debugfs initialization Now cleanup_fault_attr_dentries() recursively removes a directory, So we can simplify the error handling in the initialization code and no need to hold dentry structs for each debugfs file. 
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 47 ++++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 72c6820a345c..1dbcf8888f14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1370,21 +1370,12 @@ failed: #ifdef CONFIG_FAIL_PAGE_ALLOC -static struct fail_page_alloc_attr { +static struct { struct fault_attr attr; u32 ignore_gfp_highmem; u32 ignore_gfp_wait; u32 min_order; - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - - struct dentry *ignore_gfp_highmem_file; - struct dentry *ignore_gfp_wait_file; - struct dentry *min_order_file; - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, @@ -1424,30 +1415,24 @@ static int __init fail_page_alloc_debugfs(void) "fail_page_alloc"); if (err) return err; + dir = fail_page_alloc.attr.dir; - fail_page_alloc.ignore_gfp_wait_file = - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait); - - fail_page_alloc.ignore_gfp_highmem_file = - debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem); - fail_page_alloc.min_order_file = - debugfs_create_u32("min-order", mode, dir, - &fail_page_alloc.min_order); - - if (!fail_page_alloc.ignore_gfp_wait_file || - !fail_page_alloc.ignore_gfp_highmem_file || - !fail_page_alloc.min_order_file) { - err = -ENOMEM; - debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); - debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); - debugfs_remove(fail_page_alloc.min_order_file); - cleanup_fault_attr_dentries(&fail_page_alloc.attr); - } + if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_wait)) + goto fail; + if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem)) + goto fail; + if (!debugfs_create_u32("min-order", mode, dir, + &fail_page_alloc.min_order)) + goto fail; + + return 0; +fail: + cleanup_fault_attr_dentries(&fail_page_alloc.attr); - return err; + return -ENOMEM; } late_initcall(fail_page_alloc_debugfs); -- cgit v1.2.3 From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:06 -0700 Subject: atomic: use This allows us to move duplicated code in (atomic_inc_not_zero() for now) to Signed-off-by: Arun Sharma Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Cc: Eric Dumazet Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/atomic.h | 1 - arch/alpha/include/asm/local.h | 2 +- arch/alpha/kernel/perf_event.c | 2 +- arch/alpha/kernel/smp.c | 2 +- arch/alpha/lib/dec_and_lock.c | 2 +- arch/arm/include/asm/atomic.h | 1 - arch/arm/kernel/smp.c | 2 +- arch/arm/kernel/traps.c | 2 +- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-bcmring/dma.c | 2 +- arch/arm/mach-cns3xxx/include/mach/pm.h | 2 +- arch/arm/mach-cns3xxx/pm.c | 2 +- arch/arm/mach-omap1/pm.c | 2 +- arch/arm/mach-s3c2440/clock.c | 2 +- arch/arm/mach-s3c2440/s3c2442.c | 2 +- arch/arm/mach-s3c2440/s3c244x-clock.c | 2 +- arch/avr32/include/asm/atomic.h | 1 - arch/blackfin/include/asm/atomic.h | 1 - arch/blackfin/include/asm/dma.h | 2 +- arch/blackfin/include/asm/ipipe.h | 2 +- arch/blackfin/include/asm/spinlock.h | 2 +- arch/blackfin/kernel/ftrace.c | 2 +- arch/blackfin/kernel/ipipe.c | 2 +- arch/blackfin/kernel/nmi.c | 2 +- 
arch/blackfin/mach-common/smp.c | 2 +- arch/cris/arch-v32/drivers/cryptocop.c | 2 +- arch/cris/arch-v32/kernel/smp.c | 2 +- arch/cris/include/asm/atomic.h | 1 - arch/cris/include/asm/bitops.h | 2 +- arch/cris/kernel/process.c | 2 +- arch/frv/include/asm/atomic.h | 1 - arch/frv/include/asm/hardirq.h | 2 +- arch/frv/kernel/irq.c | 2 +- arch/h8300/include/asm/atomic.h | 1 - arch/ia64/include/asm/atomic.h | 1 - arch/ia64/include/asm/processor.h | 2 +- arch/ia64/include/asm/spinlock.h | 2 +- arch/ia64/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/uncached.c | 2 +- arch/m32r/include/asm/atomic.h | 1 - arch/m32r/include/asm/mmu_context.h | 2 +- arch/m32r/include/asm/spinlock.h | 2 +- arch/m32r/kernel/smp.c | 2 +- arch/m32r/kernel/traps.c | 2 +- arch/m68k/include/asm/atomic.h | 1 - arch/microblaze/include/asm/mmu_context_mm.h | 2 +- arch/microblaze/include/asm/prom.h | 2 +- arch/mips/include/asm/atomic.h | 1 - arch/mips/include/asm/hw_irq.h | 2 +- arch/mips/include/asm/local.h | 2 +- arch/mips/include/asm/smp.h | 2 +- arch/mips/kernel/irq.c | 2 +- arch/mips/kernel/mips-mt.c | 2 +- arch/mips/kernel/rtlx.c | 2 +- arch/mips/kernel/smp-cmp.c | 2 +- arch/mips/kernel/smp-mt.c | 2 +- arch/mips/kernel/smp.c | 2 +- arch/mips/kernel/smtc-proc.c | 2 +- arch/mips/kernel/smtc.c | 2 +- arch/mips/kernel/sync-r4k.c | 2 +- arch/mips/kernel/vpe.c | 2 +- arch/mips/mipssim/sim_smtc.c | 2 +- arch/mips/sgi-ip27/ip27-nmi.c | 2 +- arch/mn10300/include/asm/atomic.h | 1 - arch/mn10300/include/asm/mmu_context.h | 2 +- arch/mn10300/include/asm/spinlock.h | 2 +- arch/mn10300/include/asm/system.h | 2 +- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/mn10300/kernel/traps.c | 2 +- arch/mn10300/mm/misalignment.c | 2 +- arch/mn10300/proc-mn2ws0050/proc-init.c | 2 +- arch/parisc/include/asm/atomic.h | 1 - arch/parisc/include/asm/bitops.h | 2 +- arch/parisc/include/asm/mmu_context.h | 2 +- arch/parisc/kernel/parisc_ksyms.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/parisc/kernel/traps.c | 2 +- arch/parisc/lib/bitops.c | 2 +- arch/powerpc/include/asm/atomic.h | 1 - arch/powerpc/include/asm/emulated_ops.h | 2 +- arch/powerpc/include/asm/irq.h | 2 +- arch/powerpc/include/asm/local.h | 2 +- arch/powerpc/include/asm/prom.h | 2 +- arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/kernel/smp-tbsync.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/platforms/83xx/km83xx.c | 2 +- arch/powerpc/platforms/83xx/mpc832x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_itx.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc836x_mds.c | 2 +- arch/powerpc/platforms/83xx/sbc834x.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_cds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- arch/powerpc/platforms/85xx/sbc8548.c | 2 +- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/cell/spufs/context.c | 2 +- arch/powerpc/platforms/chrp/smp.c | 2 +- arch/powerpc/platforms/iseries/smp.c | 2 +- arch/powerpc/platforms/powermac/backlight.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 2 +- arch/powerpc/platforms/pseries/eeh.c | 2 +- arch/powerpc/platforms/pseries/eeh_cache.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- arch/powerpc/sysdev/fsl_soc.c | 2 +- arch/powerpc/sysdev/tsi108_dev.c | 2 +- arch/s390/include/asm/atomic.h | 1 - arch/s390/kernel/dis.c | 2 +- arch/s390/kernel/traps.c | 2 +- 
arch/sh/include/asm/atomic.h | 1 - arch/sh/include/asm/hw_irq.h | 2 +- arch/sh/include/asm/smp.h | 2 +- arch/sh/kernel/idle.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/sh/kernel/traps_64.c | 2 +- arch/sh/kernel/unwinder.c | 2 +- arch/sparc/include/asm/atomic_32.h | 1 - arch/sparc/include/asm/atomic_64.h | 1 - arch/sparc/include/asm/prom.h | 2 +- arch/sparc/include/asm/smp_32.h | 2 +- arch/sparc/include/asm/smp_64.h | 2 +- arch/sparc/kernel/irq_64.c | 2 +- arch/sparc/kernel/leon_smp.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/sparc/kernel/smp_32.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/lib/atomic32.c | 2 +- arch/tile/include/asm/atomic.h | 9 --------- arch/tile/include/asm/atomic_32.h | 4 ++-- arch/tile/include/asm/atomic_64.h | 2 +- arch/tile/include/asm/bitops_32.h | 2 +- arch/tile/include/asm/bitops_64.h | 2 +- arch/tile/include/asm/spinlock_32.h | 2 +- arch/tile/kernel/intvec_32.S | 2 +- arch/tile/lib/atomic_32.c | 2 +- arch/tile/lib/atomic_asm_32.S | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/atomic.h | 1 - arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/local.h | 2 +- arch/x86/include/asm/mce.h | 2 +- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/prom.h | 2 +- arch/x86/include/asm/spinlock.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/lib/atomic64_32.c | 2 +- arch/x86/mm/mmio-mod.c | 2 +- arch/xtensa/include/asm/atomic.h | 1 - arch/xtensa/kernel/process.c | 2 +- crypto/af_alg.c | 2 +- crypto/proc.c | 2 +- crypto/rng.c | 2 +- drivers/atm/ambassador.c | 2 +- drivers/atm/atmtcp.c | 2 +- drivers/atm/eni.c | 2 +- drivers/atm/eni.h | 2 +- drivers/atm/firestream.c | 2 +- drivers/atm/fore200e.c | 2 +- drivers/atm/horizon.c | 2 +- drivers/atm/idt77252.c | 2 +- drivers/atm/iphase.c | 2 +- drivers/atm/nicstar.c | 2 +- drivers/atm/suni.c | 2 +- drivers/atm/uPD98402.c | 2 +- drivers/atm/zatm.c | 2 +- drivers/base/memory.c | 2 +- drivers/base/power/sysfs.c | 2 +- drivers/block/cciss_scsi.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/char/mspec.c | 2 +- drivers/connector/cn_proc.c | 3 ++- drivers/edac/edac_stub.c | 2 +- drivers/firewire/core-card.c | 2 +- drivers/firewire/core-device.c | 2 +- drivers/firewire/core-topology.c | 2 +- drivers/firewire/core.h | 2 +- drivers/firewire/nosy.c | 2 +- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_fence.c | 2 +- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- drivers/gpu/drm/ttm/ttm_lock.c | 2 +- drivers/gpu/drm/ttm/ttm_object.c | 2 +- drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 +- drivers/hwmon/sht15.c | 2 +- drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/ehca/ehca_tools.h | 2 +- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/srp/ib_srp.c | 2 +- drivers/isdn/gigaset/gigaset.h | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-queue-length.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/media/video/hdpvr/hdpvr-core.c | 2 +- drivers/media/video/tlg2300/pd-dvb.c | 2 +- drivers/media/video/uvc/uvc_ctrl.c | 2 +- drivers/media/video/uvc/uvc_queue.c | 2 +- drivers/media/video/uvc/uvc_v4l2.c | 2 +- 
drivers/media/video/uvc/uvc_video.c | 2 +- drivers/message/i2o/i2o_scsi.c | 2 +- drivers/misc/phantom.c | 2 +- drivers/net/atlx/atl1.c | 2 +- drivers/net/atlx/atl2.c | 2 +- drivers/net/atlx/atl2.h | 2 +- drivers/net/cassini.c | 2 +- drivers/net/cpmac.c | 2 +- drivers/net/cxgb3/cxgb3_offload.c | 2 +- drivers/net/cxgb3/l2t.h | 2 +- drivers/net/cxgb3/t3cdev.h | 2 +- drivers/net/cxgb4/cxgb4_uld.h | 2 +- drivers/net/cxgb4/l2t.h | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/dmascc.c | 2 +- drivers/net/ibmveth.c | 2 +- drivers/net/phy/phy.c | 2 +- drivers/net/ppp_generic.c | 2 +- drivers/net/wimax/i2400m/i2400m.h | 2 +- drivers/net/wireless/b43legacy/b43legacy.h | 2 +- drivers/net/wireless/b43legacy/dma.h | 2 +- drivers/oprofile/oprofile_stats.h | 2 +- drivers/pci/hotplug/cpci_hotplug_core.c | 2 +- drivers/pci/xen-pcifront.c | 2 +- drivers/s390/block/dasd_eer.c | 2 +- drivers/s390/char/sclp_quiesce.c | 2 +- drivers/s390/char/vmlogrdr.c | 2 +- drivers/s390/cio/device.h | 2 +- drivers/s390/cio/qdio_main.c | 2 +- drivers/s390/cio/qdio_thinint.c | 2 +- drivers/s390/crypto/ap_bus.c | 2 +- drivers/s390/crypto/zcrypt_api.c | 2 +- drivers/s390/crypto/zcrypt_cex2a.c | 2 +- drivers/s390/crypto/zcrypt_mono.c | 2 +- drivers/s390/crypto/zcrypt_pcica.c | 2 +- drivers/s390/crypto/zcrypt_pcicc.c | 2 +- drivers/s390/crypto/zcrypt_pcixcc.c | 2 +- drivers/s390/net/fsm.h | 2 +- drivers/s390/scsi/zfcp_scsi.c | 2 +- drivers/sbus/char/display7seg.c | 2 +- drivers/scsi/dpt/dpti_i2o.h | 2 +- drivers/scsi/hpsa.c | 2 +- drivers/scsi/pm8001/pm8001_sas.h | 2 +- drivers/staging/octeon/ethernet-rx.c | 2 +- drivers/staging/octeon/ethernet-tx.c | 2 +- drivers/staging/solo6x10/solo6x10.h | 2 +- drivers/staging/tidspbridge/include/dspbridge/host_os.h | 2 +- drivers/staging/winbond/mds_s.h | 2 +- drivers/staging/winbond/wb35reg_s.h | 2 +- drivers/tty/bfin_jtag_comm.c | 2 +- drivers/tty/rocket.c | 2 +- drivers/tty/serial/dz.c | 2 +- drivers/tty/serial/sb1250-duart.c | 2 +- drivers/tty/serial/zs.c | 2 +- drivers/usb/gadget/f_audio.c | 2 +- drivers/usb/gadget/f_rndis.c | 2 +- drivers/usb/gadget/uvc_queue.c | 2 +- drivers/usb/image/microtek.c | 2 +- drivers/usb/misc/appledisplay.c | 2 +- drivers/usb/serial/garmin_gps.c | 2 +- drivers/usb/wusbcore/wa-rpipe.c | 2 +- drivers/vhost/vhost.h | 2 +- drivers/video/sh_mobile_lcdcfb.c | 2 +- drivers/video/vermilion/vermilion.h | 2 +- drivers/w1/masters/matrox_w1.c | 2 +- drivers/w1/w1.c | 2 +- drivers/w1/w1_family.h | 2 +- drivers/watchdog/intel_scu_watchdog.c | 2 +- drivers/watchdog/sbc7240_wdt.c | 2 +- fs/btrfs/delayed-inode.h | 2 +- fs/direct-io.c | 2 +- fs/eventpoll.c | 2 +- fs/file_table.c | 2 +- fs/gfs2/main.c | 2 +- fs/nfs/cache_lib.h | 2 +- fs/nfs/direct.c | 2 +- fs/notify/group.c | 2 +- fs/notify/inode_mark.c | 2 +- fs/notify/mark.c | 2 +- fs/notify/notification.c | 2 +- fs/notify/vfsmount_mark.c | 2 +- fs/ntfs/inode.h | 2 +- fs/posix_acl.c | 2 +- fs/proc/meminfo.c | 2 +- include/acpi/platform/aclinux.h | 2 +- include/asm-generic/atomic.h | 2 -- include/asm-generic/local.h | 2 +- include/asm-generic/local64.h | 2 +- include/drm/ttm/ttm_lock.h | 2 +- include/linux/aio.h | 2 +- include/linux/atmdev.h | 2 +- include/linux/atomic.h | 9 +++++++++ include/linux/backing-dev.h | 2 +- include/linux/bit_spinlock.h | 2 +- include/linux/buffer_head.h | 2 +- include/linux/configfs.h | 2 +- include/linux/connector.h | 2 +- include/linux/cred.h | 2 +- include/linux/crypto.h | 2 +- include/linux/dcache.h | 2 +- include/linux/debug_locks.h | 2 +- include/linux/device.h | 2 +- 
include/linux/edac.h | 2 +- include/linux/fault-inject.h | 2 +- include/linux/fdtable.h | 2 +- include/linux/filter.h | 2 +- include/linux/firewire.h | 2 +- include/linux/fsnotify_backend.h | 2 +- include/linux/interrupt.h | 2 +- include/linux/jump_label.h | 2 +- include/linux/kdb.h | 2 +- include/linux/key.h | 2 +- include/linux/kgdb.h | 2 +- include/linux/kobject.h | 2 +- include/linux/mlx4/device.h | 2 +- include/linux/mman.h | 2 +- include/linux/mmzone.h | 2 +- include/linux/mount.h | 2 +- include/linux/mutex.h | 2 +- include/linux/netdevice.h | 2 +- include/linux/nfs_fs_sb.h | 2 +- include/linux/oprofile.h | 2 +- include/linux/pci.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/phy.h | 2 +- include/linux/proc_fs.h | 2 +- include/linux/quota.h | 2 +- include/linux/rwsem.h | 2 +- include/linux/sem.h | 2 +- include/linux/skbuff.h | 2 +- include/linux/sonet.h | 2 +- include/linux/spinlock.h | 2 +- include/linux/sunrpc/auth.h | 2 +- include/linux/sunrpc/cache.h | 2 +- include/linux/sunrpc/timer.h | 2 +- include/linux/swap.h | 2 +- include/linux/sysfs.h | 2 +- include/linux/vmstat.h | 2 +- include/linux/workqueue.h | 2 +- include/net/ax25.h | 2 +- include/net/cipso_ipv4.h | 2 +- include/net/flow.h | 2 +- include/net/inet_hashtables.h | 2 +- include/net/inet_timewait_sock.h | 2 +- include/net/inetpeer.h | 2 +- include/net/ip_vs.h | 2 +- include/net/lib80211.h | 2 +- include/net/llc.h | 2 +- include/net/neighbour.h | 2 +- include/net/net_namespace.h | 2 +- include/net/netfilter/nf_conntrack.h | 2 +- include/net/netlabel.h | 2 +- include/net/netns/conntrack.h | 2 +- include/net/sctp/structs.h | 2 +- include/pcmcia/ds.h | 2 +- include/rdma/ib_sa.h | 2 +- include/rdma/ib_verbs.h | 2 +- include/rxrpc/types.h | 2 +- include/scsi/scsi_device.h | 2 +- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- kernel/cgroup.c | 2 +- kernel/cpuset.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/rcupdate.c | 2 +- kernel/rcutorture.c | 2 +- kernel/rcutree_trace.c | 2 +- kernel/rwsem.c | 2 +- kernel/stop_machine.c | 2 +- kernel/taskstats.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_mmiotrace.c | 2 +- lib/atomic64.c | 2 +- lib/atomic64_test.c | 2 +- lib/crc32.c | 2 +- lib/dec_and_lock.c | 2 +- mm/init-mm.c | 2 +- mm/kmemleak.c | 2 +- mm/slob.c | 2 +- mm/vmalloc.c | 2 +- net/atm/atm_misc.c | 2 +- net/atm/clip.c | 2 +- net/atm/common.c | 2 +- net/atm/lec.c | 2 +- net/atm/proc.c | 2 +- net/bridge/br_fdb.c | 2 +- net/core/flow.c | 2 +- net/decnet/dn_fib.c | 2 +- net/decnet/dn_neigh.c | 2 +- net/decnet/dn_table.c | 2 +- net/decnet/dn_timer.c | 2 +- net/ipv4/cipso_ipv4.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/iucv/iucv.c | 2 +- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netlabel/netlabel_cipso_v4.c | 2 +- net/netlabel/netlabel_kapi.c | 2 +- net/netlabel/netlabel_mgmt.c | 2 +- net/netlabel/netlabel_mgmt.h | 2 +- net/netlabel/netlabel_unlabeled.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- net/tipc/core.h | 2 +- security/selinux/hooks.c | 2 +- security/selinux/xfrm.c | 2 +- sound/pci/echoaudio/darla20.c | 2 +- sound/pci/echoaudio/darla24.c | 2 +- sound/pci/echoaudio/echo3g.c | 2 +- sound/pci/echoaudio/gina20.c | 2 +- sound/pci/echoaudio/gina24.c | 2 +- sound/pci/echoaudio/indigo.c | 2 +- sound/pci/echoaudio/indigodj.c | 2 +- sound/pci/echoaudio/indigodjx.c | 2 +- sound/pci/echoaudio/indigoio.c | 2 +- sound/pci/echoaudio/indigoiox.c | 2 +- sound/pci/echoaudio/layla20.c | 2 +- 
sound/pci/echoaudio/layla24.c | 2 +- sound/pci/echoaudio/mia.c | 2 +- sound/pci/echoaudio/mona.c | 2 +- sound/pci/lx6464es/lx6464es.h | 2 +- sound/sparc/dbri.c | 2 +- 439 files changed, 427 insertions(+), 448 deletions(-) (limited to 'mm') diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index e756d04b6cd5..88b7491490bc 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -199,7 +199,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /** * atomic64_add_unless - add unless the number is a given value diff --git a/arch/alpha/include/asm/local.h b/arch/alpha/include/asm/local.h index b9e3e3318371..9c94b8456043 100644 --- a/arch/alpha/include/asm/local.h +++ b/arch/alpha/include/asm/local.h @@ -2,7 +2,7 @@ #define _ALPHA_LOCAL_H #include -#include +#include typedef struct { diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 8e47709160f8..8143cd7cdbfb 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index d739703608fc..4087a569b43b 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c index 0f5520d2f45f..f9f5fe830e9f 100644 --- a/arch/alpha/lib/dec_and_lock.c +++ b/arch/alpha/lib/dec_and_lock.c @@ -6,7 +6,7 @@ */ #include -#include +#include asm (".text \n\ .global _atomic_dec_and_lock \n\ diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 7e79503ab89b..4d501f1bdc9d 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -217,7 +217,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) c = old; return c != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_inc(v) atomic_add(1, v) #define atomic_dec(v) atomic_sub(1, v) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 167e3cbe1f2f..d88ff0230e82 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2d3436e9f71f..bc9f9da782cb 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index ea53f4d9b283..4159eca78945 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c index 9f2a948e0e72..0ca00050666a 100644 --- a/arch/arm/mach-bcmring/dma.c +++ b/arch/arm/mach-bcmring/dma.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include /* I don't quite understand why dc4 fails when this is set to 1 and DMA is enabled */ diff --git a/arch/arm/mach-cns3xxx/include/mach/pm.h b/arch/arm/mach-cns3xxx/include/mach/pm.h index 6eae7f764d1d..c2588cc991d1 100644 --- a/arch/arm/mach-cns3xxx/include/mach/pm.h +++ b/arch/arm/mach-cns3xxx/include/mach/pm.h @@ -11,7 +11,7 @@ #ifndef __CNS3XXX_PM_H #define __CNS3XXX_PM_H -#include +#include void 
cns3xxx_pwr_clk_en(unsigned int block); void cns3xxx_pwr_clk_dis(unsigned int block); diff --git a/arch/arm/mach-cns3xxx/pm.c b/arch/arm/mach-cns3xxx/pm.c index 5e579552aa54..0c04678615ce 100644 --- a/arch/arm/mach-cns3xxx/pm.c +++ b/arch/arm/mach-cns3xxx/pm.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index 98ba9784aa15..495b3987d461 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/clock.c b/arch/arm/mach-s3c2440/clock.c index 554e0d3ec70b..f9e6bdaf41d2 100644 --- a/arch/arm/mach-s3c2440/clock.c +++ b/arch/arm/mach-s3c2440/clock.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/s3c2442.c b/arch/arm/mach-s3c2440/s3c2442.c index 6224bad4d604..9ad99f8016a1 100644 --- a/arch/arm/mach-s3c2440/s3c2442.c +++ b/arch/arm/mach-s3c2440/s3c2442.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/s3c244x-clock.c b/arch/arm/mach-s3c2440/s3c244x-clock.c index f8d96130d1d1..7f5ea0a169a5 100644 --- a/arch/arm/mach-s3c2440/s3c244x-clock.c +++ b/arch/arm/mach-s3c2440/s3c244x-clock.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h index bbce6a1c6bb6..f229c3849f03 100644 --- a/arch/avr32/include/asm/atomic.h +++ b/arch/avr32/include/asm/atomic.h @@ -188,7 +188,6 @@ static inline int atomic_sub_if_positive(int i, atomic_t *v) #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0) -#define atomic_inc_not_zero(v) atomic_add_unless(v, 1, 0) #define atomic_dec_if_positive(v) atomic_sub_if_positive(1, v) #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h index 4c707dbe1ff9..f2cf5b714ea4 100644 --- a/arch/blackfin/include/asm/atomic.h +++ b/arch/blackfin/include/asm/atomic.h @@ -97,7 +97,6 @@ static inline void atomic_set_mask(int mask, atomic_t *v) c = old; \ c != (u); \ }) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_inc_and_test - increment and test diff --git a/arch/blackfin/include/asm/dma.h b/arch/blackfin/include/asm/dma.h index d9dbc1a53534..dac0c97242bb 100644 --- a/arch/blackfin/include/asm/dma.h +++ b/arch/blackfin/include/asm/dma.h @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/include/asm/ipipe.h b/arch/blackfin/include/asm/ipipe.h index 9e0cc0e2534f..17b5e92e3bc6 100644 --- a/arch/blackfin/include/asm/ipipe.h +++ b/arch/blackfin/include/asm/ipipe.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index 2336093fca23..490c7caa02d9 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -11,7 +11,7 @@ # include #else -#include +#include asmlinkage int __raw_spin_is_locked_asm(volatile int *ptr); asmlinkage void __raw_spin_lock_asm(volatile int *ptr); diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index 48808a12b427..9277905b82cf 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -9,7 +9,7 @@ 
#include #include #include -#include +#include #include #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/blackfin/kernel/ipipe.c b/arch/blackfin/kernel/ipipe.c index 486426f8a0d7..dbe11220cc53 100644 --- a/arch/blackfin/kernel/ipipe.c +++ b/arch/blackfin/kernel/ipipe.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include DEFINE_PER_CPU(struct pt_regs, __ipipe_tick_regs); diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 679d0db35256..9919d29287dc 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 1c143a4de5f5..107622aacf6b 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index c03bc3bc30c2..642c6fed43d7 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index a0843a71aaee..0b99df72d2a4 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h index 88dc9b9c4ba0..ce9f67e4d977 100644 --- a/arch/cris/include/asm/atomic.h +++ b/arch/cris/include/asm/atomic.h @@ -150,7 +150,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) cris_atomic_restore(v, flags); return ret != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* Atomic operations are already serializing */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h index c0092fc7d846..a78a2d70cd8b 100644 --- a/arch/cris/include/asm/bitops.h +++ b/arch/cris/include/asm/bitops.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include /* diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index c99aeab7cef7..aa585e4e979e 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -12,7 +12,7 @@ * This file handles the architecture-dependent parts of process handling.. 
*/ -#include +#include #include #include #include diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h index fae32c7fdcb6..b07b75f411f2 100644 --- a/arch/frv/include/asm/atomic.h +++ b/arch/frv/include/asm/atomic.h @@ -256,7 +256,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #include #endif /* _ASM_ATOMIC_H */ diff --git a/arch/frv/include/asm/hardirq.h b/arch/frv/include/asm/hardirq.h index 5fc8b6f5bc55..c62833d6ebbb 100644 --- a/arch/frv/include/asm/hardirq.h +++ b/arch/frv/include/asm/hardirq.h @@ -12,7 +12,7 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include +#include extern atomic_t irq_err_count; static inline void ack_bad_irq(int irq) diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index a5f624a9f559..3facbc28cbbc 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index 984221abb66d..b641714774ea 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -116,7 +116,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) local_irq_restore(flags); return ret != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ void atomic_clear_mask(unsigned long mask, unsigned long *v) { diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 446881439675..fdb887005dff 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -105,7 +105,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u) { diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 03afe7970748..d9f397fae03e 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -75,7 +75,7 @@ #include #include #include -#include +#include #ifdef CONFIG_NUMA #include #endif diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 1a91c9121d17..b77768d35f93 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index be450a3e9871..0bd537b4ea6b 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 14ec641003da..559097986672 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index c4696d217ce0..6a867dc45c05 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h index d44a51e5271b..d64d894dc549 100644 --- a/arch/m32r/include/asm/atomic.h +++ b/arch/m32r/include/asm/atomic.h @@ -262,7 +262,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != 
(u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ void atomic_clear_mask(unsigned long mask, atomic_t *addr) { diff --git a/arch/m32r/include/asm/mmu_context.h b/arch/m32r/include/asm/mmu_context.h index a70a3df33635..a979a4198168 100644 --- a/arch/m32r/include/asm/mmu_context.h +++ b/arch/m32r/include/asm/mmu_context.h @@ -11,7 +11,7 @@ #ifndef __ASSEMBLY__ -#include +#include #include #include #include diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 179a06489b10..b0ea2f26da3b 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -10,7 +10,7 @@ */ #include -#include +#include #include /* diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 092d40a6708e..ce7aea34fdf4 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index fbd109031df3..ee6a9199561c 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index 307a573881ad..e844a2d2ba23 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -198,7 +198,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* Atomic operations are already serializing */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index 3e5c254e8d1c..d68647746448 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -11,7 +11,7 @@ #ifndef _ASM_MICROBLAZE_MMU_CONTEXT_H #define _ASM_MICROBLAZE_MMU_CONTEXT_H -#include +#include #include #include #include diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 9bd01ecb00d6..9ad567e2d425 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -21,7 +21,7 @@ #include #include -#include +#include #define HAVE_ARCH_DEVTREE_FIXUPS diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 4a02fe891ab6..833a4023648a 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -325,7 +325,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) } return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_dec_return(v) atomic_sub_return(1, (v)) #define atomic_inc_return(v) atomic_add_return(1, (v)) diff --git a/arch/mips/include/asm/hw_irq.h b/arch/mips/include/asm/hw_irq.h index 77adda297ad9..9e8ef5994c9c 100644 --- a/arch/mips/include/asm/hw_irq.h +++ b/arch/mips/include/asm/hw_irq.h @@ -8,7 +8,7 @@ #ifndef __ASM_HW_IRQ_H #define __ASM_HW_IRQ_H -#include +#include extern atomic_t irq_err_count; diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h index fffc8307a80a..94fde8d0fac1 100644 --- a/arch/mips/include/asm/local.h +++ b/arch/mips/include/asm/local.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index af42385245d5..d4fb4d852a6d 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -17,7 +17,7 @@ 
#include #include -#include +#include #include extern int smp_num_siblings; diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index 9b734d74ae8e..b53970d80991 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index b2259e7cd829..594ca69cb867 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index 557ef72472e0..7a80b7cda7cc 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index cc81771b882c..fe3095160655 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 1ec56e635d04..ce9e286f0a74 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 32a256101082..32c1e954cd37 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smtc-proc.c b/arch/mips/kernel/smtc-proc.c index fe256559c997..928a5a61e1a6 100644 --- a/arch/mips/kernel/smtc-proc.c +++ b/arch/mips/kernel/smtc-proc.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index cedac4633741..f0895e70e283 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/sync-r4k.c b/arch/mips/kernel/sync-r4k.c index 05dd170a83f7..99f913c8d7a6 100644 --- a/arch/mips/kernel/sync-r4k.c +++ b/arch/mips/kernel/sync-r4k.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index dbb6b408f001..2cd50ad0d5c6 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/mipssim/sim_smtc.c b/arch/mips/mipssim/sim_smtc.c index 30df47258c2c..915063991f6e 100644 --- a/arch/mips/mipssim/sim_smtc.c +++ b/arch/mips/mipssim/sim_smtc.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/sgi-ip27/ip27-nmi.c b/arch/mips/sgi-ip27/ip27-nmi.c index bc4fa8dd67f3..005c29ed419a 100644 --- a/arch/mips/sgi-ip27/ip27-nmi.c +++ b/arch/mips/sgi-ip27/ip27-nmi.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 9d773a639513..041b9d69d86c 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -269,7 +269,6 @@ static inline void atomic_dec(atomic_t *v) c != (u); \ }) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /** * atomic_clear_mask - Atomically clear bits in memory diff --git 
a/arch/mn10300/include/asm/mmu_context.h b/arch/mn10300/include/asm/mmu_context.h index c8f6c82672ad..c67c2b5365a6 100644 --- a/arch/mn10300/include/asm/mmu_context.h +++ b/arch/mn10300/include/asm/mmu_context.h @@ -22,7 +22,7 @@ #ifndef _ASM_MMU_CONTEXT_H #define _ASM_MMU_CONTEXT_H -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 93429154e898..1ae580f38933 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -11,7 +11,7 @@ #ifndef _ASM_SPINLOCK_H #define _ASM_SPINLOCK_H -#include +#include #include #include diff --git a/arch/mn10300/include/asm/system.h b/arch/mn10300/include/asm/system.h index 8ff3e5aaca41..94b4c5e1491b 100644 --- a/arch/mn10300/include/asm/system.h +++ b/arch/mn10300/include/asm/system.h @@ -19,7 +19,7 @@ #include #include -#include +#include #if !defined(CONFIG_LAZY_SAVE_FPU) struct fpu_state_struct; diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index c5e12bfd9fcd..a45f0c7549a6 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index bd3e5e73826e..9220a75a7b43 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index eef989c1d0c1..f9bb8cb1c14a 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c index c58249b9525a..fe6e24906ffc 100644 --- a/arch/mn10300/proc-mn2ws0050/proc-init.c +++ b/arch/mn10300/proc-mn2ws0050/proc-init.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index f81955934aeb..192488999b63 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -220,7 +220,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_add(i,v) ((void)(__atomic_add_return( (i),(v)))) #define atomic_sub(i,v) ((void)(__atomic_add_return(-(i),(v)))) diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 4e833aa05a44..8c9b631d2a78 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -8,7 +8,7 @@ #include #include /* for BITS_PER_LONG/SHIFT_PER_LONG */ #include -#include +#include /* * HP-PARISC specific bit operations diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 354b2aca990e..59be25764433 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index df653663d3db..a7bb757a5497 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -31,7 +31,7 @@ #include EXPORT_SYMBOL(memset); -#include 
+#include EXPORT_SYMBOL(__xchg8); EXPORT_SYMBOL(__xchg32); EXPORT_SYMBOL(__cmpxchg_u32); diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 828305f19cff..32d588488f04 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 8b58bf0b7d5a..f19e6604026a 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c index 353963d42059..a8bffd8af77d 100644 --- a/arch/parisc/lib/bitops.c +++ b/arch/parisc/lib/bitops.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SMP arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = { diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index b8f152ece025..b2bcbee622ea 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -212,7 +212,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return t != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_sub_and_test(a, v) (atomic_sub_return((a), (v)) == 0) #define atomic_dec_and_test(v) (atomic_dec_return((v)) == 0) diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index 2cc41c715d2b..63f2a22e9954 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -18,7 +18,7 @@ #ifndef _ASM_POWERPC_EMULATED_OPS_H #define _ASM_POWERPC_EMULATED_OPS_H -#include +#include #include diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index c57a28e52b64..c0e1bc319e35 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -14,7 +14,7 @@ #include #include -#include +#include /* Define a way to iterate across irqs. 
*/ diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h index c2410af6bfd9..b8da91363864 100644 --- a/arch/powerpc/include/asm/local.h +++ b/arch/powerpc/include/asm/local.h @@ -2,7 +2,7 @@ #define _ARCH_POWERPC_LOCAL_H #include -#include +#include typedef struct { diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index b823536375dc..b5c91901e384 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -18,7 +18,7 @@ */ #include #include -#include +#include #define HAVE_ARCH_DEVTREE_FIXUPS diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 24582181b6ec..59dbf6abaaf3 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #ifdef CONFIG_PPC_OF_PLATFORM_PCI diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 7d28f540200c..f5ae872a2ef0 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 0e0ea941156f..d5ca8236315c 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 67f6c3b51357..481ef064c8f1 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c index 03e45c4a9ef1..640de836e466 100644 --- a/arch/powerpc/kernel/smp-tbsync.c +++ b/arch/powerpc/kernel/smp-tbsync.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f932f8a0cf0c..7bf2187dfd99 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c index f8fa2fc3129f..c55129f5760a 100644 --- a/arch/powerpc/platforms/83xx/km83xx.c +++ b/arch/powerpc/platforms/83xx/km83xx.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc832x_mds.c b/arch/powerpc/platforms/83xx/mpc832x_mds.c index 93e60f1f21a9..32a52896822f 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc832x_mds.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_itx.c b/arch/powerpc/platforms/83xx/mpc834x_itx.c index 81e44fa1c644..6b45969567d4 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_itx.c +++ b/arch/powerpc/platforms/83xx/mpc834x_itx.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_mds.c b/arch/powerpc/platforms/83xx/mpc834x_mds.c index c1b1dc50b32a..041c5177e737 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc834x_mds.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc836x_mds.c 
b/arch/powerpc/platforms/83xx/mpc836x_mds.c index 81c052b1353e..934cc8c46bbc 100644 --- a/arch/powerpc/platforms/83xx/mpc836x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc836x_mds.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/sbc834x.c b/arch/powerpc/platforms/83xx/sbc834x.c index 49023dbe1576..af41d8c810a8 100644 --- a/arch/powerpc/platforms/83xx/sbc834x.c +++ b/arch/powerpc/platforms/83xx/sbc834x.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index 6299a2a51ae8..2bf99786d249 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 747d1ee661fd..973b3f4a4b49 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c index ecdd8c09e4ed..d07dcb7f4ee9 100644 --- a/arch/powerpc/platforms/85xx/sbc8548.c +++ b/arch/powerpc/platforms/85xx/sbc8548.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index d809836bcf5f..7f92096fe968 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index dbb641ea90dd..f2e1dfe4bf31 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index 0c87bcd2452a..bf4d41d8fa14 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include "spufs.h" diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c index a800122e4dda..feab30bbae23 100644 --- a/arch/powerpc/platforms/chrp/smp.c +++ b/arch/powerpc/platforms/chrp/smp.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/iseries/smp.c b/arch/powerpc/platforms/iseries/smp.c index 2df48c2287bd..8bda9be06fa0 100644 --- a/arch/powerpc/platforms/iseries/smp.c +++ b/arch/powerpc/platforms/iseries/smp.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/backlight.c b/arch/powerpc/platforms/powermac/backlight.c index d679964ae2ab..c2f3e861f5ea 100644 --- a/arch/powerpc/platforms/powermac/backlight.c +++ b/arch/powerpc/platforms/powermac/backlight.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index d15fca322978..9a521dc8e485 100644 --- 
a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 46b55cf563e3..ada6e07532ec 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c index 8ed0d2d0e1b5..fc5ae767989e 100644 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ b/arch/powerpc/platforms/pseries/eeh_cache.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 1672db2d1b0e..4e44c4dcd11c 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index 265313e8396b..2d66275e489f 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index ee056807b52c..9f51f97abb5d 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index d9db13810d15..29d756329228 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -108,7 +108,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #undef __CS_LOOP diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 1ca3d1d6a86c..45df6d456aa1 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index e9372c77cced..ffabcd9d3363 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index c7983124d99d..8ddb2635cf92 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -30,7 +30,6 @@ #define atomic_inc_and_test(v) (atomic_inc_return(v) == 0) #define atomic_sub_and_test(i,v) (atomic_sub_return((i), (v)) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_inc(v) atomic_add(1, (v)) #define atomic_dec(v) atomic_sub(1, (v)) diff --git a/arch/sh/include/asm/hw_irq.h b/arch/sh/include/asm/hw_irq.h index 603cdde813d1..693d44184058 100644 --- a/arch/sh/include/asm/hw_irq.h +++ b/arch/sh/include/asm/hw_irq.h @@ -3,7 +3,7 @@ #include #include -#include +#include extern atomic_t irq_err_count; diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h index 9070d943ddde..78b0d0f4b24b 100644 --- a/arch/sh/include/asm/smp.h +++ b/arch/sh/include/asm/smp.h @@ -8,7 +8,7 @@ #ifdef CONFIG_SMP #include -#include +#include 
#include #include diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 425d604e3a28..84db0d6ccd0d 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include void (*pm_idle)(void) = NULL; diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 6207561ea34a..3147a9a6fb8b 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 67110be83fd7..cd3a40483299 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/unwinder.c b/arch/sh/kernel/unwinder.c index 468889d958f4..521b5432471f 100644 --- a/arch/sh/kernel/unwinder.c +++ b/arch/sh/kernel/unwinder.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include /* * This is the most basic stack unwinder an architecture can diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 7ae128b19d3f..7646f2cef5d0 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -52,7 +52,6 @@ extern void atomic_set(atomic_t *, int); #define atomic_dec_and_test(v) (atomic_dec_return(v) == 0) #define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* This is the old 24-bit implementation. It's still used internally * by some sparc-specific code, notably the semaphore implementation. diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index bdb2ff880bdd..337139ef91be 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -85,7 +85,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic64_cmpxchg(v, o, n) \ ((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n))) diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index 56bbaadef646..edd3d3cde460 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #define OF_ROOT_NODE_ADDR_CELLS_DEFAULT 2 #define OF_ROOT_NODE_SIZE_CELLS_DEFAULT 1 diff --git a/arch/sparc/include/asm/smp_32.h b/arch/sparc/include/asm/smp_32.h index 093f10843ff2..01c51c704341 100644 --- a/arch/sparc/include/asm/smp_32.h +++ b/arch/sparc/include/asm/smp_32.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* * Private routines/data diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h index 20bca8950710..29862a9e9065 100644 --- a/arch/sparc/include/asm/smp_64.h +++ b/arch/sparc/include/asm/smp_64.h @@ -27,7 +27,7 @@ */ #include -#include +#include #include DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 4e78862d12fd..0dd8422a469c 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index fe8fb44c609c..1210fde18740 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c @@ -28,7 +28,7 @@ #include #include -#include +#include 
#include #include diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 62a034318b18..171e8d84dc3f 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 21b125341bf7..f671e7fd6ddc 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 99cb17251bb5..4a442c32e117 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index d3c7a12ad879..1a371f8ae0b0 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -7,7 +7,7 @@ * Based on asm-parisc/atomic.h Copyright (C) 2000 Philipp Rumpf */ -#include +#include #include #include diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h index 739cfe0499d1..e3272715c3cb 100644 --- a/arch/tile/include/asm/atomic.h +++ b/arch/tile/include/asm/atomic.h @@ -121,15 +121,6 @@ static inline int atomic_read(const atomic_t *v) */ #define atomic_add_negative(i, v) (atomic_add_return((i), (v)) < 0) -/** - * atomic_inc_not_zero - increment unless the number is zero - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1, so long as @v is non-zero. - * Returns non-zero if @v was non-zero, and zero otherwise. - */ -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - /* Nonexistent functions intended to cause link errors. */ extern unsigned long __xchg_called_with_bad_pointer(void); extern unsigned long __cmpxchg_called_with_bad_pointer(void); diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index 92a8bee32311..246feed4794d 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -11,7 +11,7 @@ * NON INFRINGEMENT. See the GNU General Public License for * more details. * - * Do not include directly; use . + * Do not include directly; use . */ #ifndef _ASM_TILE_ATOMIC_32_H @@ -21,7 +21,7 @@ #ifndef __ASSEMBLY__ -/* Tile-specific routines to support . */ +/* Tile-specific routines to support . */ int _atomic_xchg(atomic_t *v, int n); int _atomic_xchg_add(atomic_t *v, int i); int _atomic_xchg_add_unless(atomic_t *v, int a, int u); diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h index 1c1e60d8ccb6..a48dda30cbcc 100644 --- a/arch/tile/include/asm/atomic_64.h +++ b/arch/tile/include/asm/atomic_64.h @@ -11,7 +11,7 @@ * NON INFRINGEMENT. See the GNU General Public License for * more details. * - * Do not include directly; use . + * Do not include directly; use . */ #ifndef _ASM_TILE_ATOMIC_64_H diff --git a/arch/tile/include/asm/bitops_32.h b/arch/tile/include/asm/bitops_32.h index d31ab905cfa7..571b118bfd9b 100644 --- a/arch/tile/include/asm/bitops_32.h +++ b/arch/tile/include/asm/bitops_32.h @@ -16,7 +16,7 @@ #define _ASM_TILE_BITOPS_32_H #include -#include +#include #include /* Tile-specific routines to support . 
*/ diff --git a/arch/tile/include/asm/bitops_64.h b/arch/tile/include/asm/bitops_64.h index 68f8c5bc0679..e9c8e381ee0e 100644 --- a/arch/tile/include/asm/bitops_64.h +++ b/arch/tile/include/asm/bitops_64.h @@ -16,7 +16,7 @@ #define _ASM_TILE_BITOPS_64_H #include -#include +#include #include /* See for API comments. */ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index a8f2c6e31a87..a5e4208d34f9 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -17,7 +17,7 @@ #ifndef _ASM_TILE_SPINLOCK_32_H #define _ASM_TILE_SPINLOCK_32_H -#include +#include #include #include #include diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index 72ade79b621b..fc94607f0bd5 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 46570211df52..771b251b409d 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 24448734f6f1..1f75a2a56101 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -70,7 +70,7 @@ */ #include -#include +#include #include #include diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 5852519b2d0f..f6f5c53dc903 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4a0b7c7e2cce..7b3ca8324b69 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 952a826ac4e5..897969bdd4e6 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -244,7 +244,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_dec_if_positive - decrement by 1 if old value positive diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 13f5504c76c0..09199052060f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2e9972468a5d..9cdae5d47e8f 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include typedef struct { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 716b48af7863..c9321f34e55b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -124,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain; #include #include -#include +#include extern int mce_disabled; extern int mce_p5_enabled; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8b5393ec1080..69021528b43c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,7 @@ #define _ASM_X86_MMU_CONTEXT_H #include -#include +#include #include #include 
#include diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index df1287019e6d..644dd885f05a 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index e9e51f710e6c..ee67edf86fdd 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H -#include +#include #include #include #include diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1f2e61e28981..a1fe5c127b52 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,7 +21,7 @@ struct task_struct; struct exec_domain; #include #include -#include +#include struct thread_info { struct task_struct *task; /* main task structure */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b117efd24f71..8a439d364b94 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b24be38c8cf8..52fa56399a50 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 9536b3fe43f8..5d513bc47b6b 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d7fbff..62184390a601 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 65b8f5c2eebf..610485223bdb 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f09d4bbe2d2d..b3300e6bacef 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a085ca..9682ec50180c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2b2255b1f04b..57dcbd4308fa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index abd86e865be3..ae432ea1cd83 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "kvm_timer.h" static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 540179e8e9fa..042f6826bf57 100644 --- a/arch/x86/lib/atomic64_32.c +++ 
b/arch/x86/lib/atomic64_32.c @@ -4,7 +4,7 @@ #include #include -#include +#include long long atomic64_read_cx8(long long, const atomic64_t *v); EXPORT_SYMBOL(atomic64_read_cx8); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3adff7dcc148..67421f38a215 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -34,7 +34,7 @@ #include #include #include /* for ISA_START_ADDRESS */ -#include +#include #include #include diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index a96a0619d0b7..7cca2fb18baf 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -248,7 +248,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static inline void atomic_clear_mask(unsigned int mask, atomic_t *v) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e3558b9a58ba..47041e7c088c 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 940d70cb5c25..ac33d5f30778 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include diff --git a/crypto/proc.c b/crypto/proc.c index 58fef67d4f4d..3808697814d7 100644 --- a/crypto/proc.c +++ b/crypto/proc.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include #include diff --git a/crypto/rng.c b/crypto/rng.c index f93cb5311182..45229ae782be 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index bb3b016b6ce8..f8f41e0e8a8c 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index 0b0625054a87..b22d71cac54c 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include extern int atm_init_aal5(struct atm_vcc *vcc); /* "raw" AAL5 transport */ diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 3230ea0df83c..93071417315f 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/atm/eni.h b/drivers/atm/eni.h index 493a6932507e..dc9a62cc2605 100644 --- a/drivers/atm/eni.h +++ b/drivers/atm/eni.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "midway.h" diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 7c7b571647f9..5072f8ac16fd 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index bc9e702186dd..361f5aee3be1 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SBUS #include diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 287506183893..b81210330aca 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index be0dbfeb541c..db06f34419cf 
100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #ifdef CONFIG_ATM_IDT77252_USE_SUNI diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index 957106f636ea..cb90f7a3e074 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index 6b313ee9231b..1c70c45fa044 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include "nicstar.h" #ifdef CONFIG_ATM_NICSTAR_USE_SUNI #include "suni.h" diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c index 41c56eae4c81..90f1ccca9e52 100644 --- a/drivers/atm/suni.c +++ b/drivers/atm/suni.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include "suni.h" diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c index c45ae0573bbd..5120a96b3a89 100644 --- a/drivers/atm/uPD98402.c +++ b/drivers/atm/uPD98402.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include "uPD98402.h" diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index 7f8c5132ff32..d889f56e8d8c 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include "uPD98401.h" diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 45d7c8fc73bd..2840ed4668c1 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include static DEFINE_MUTEX(mem_sysfs_mutex); diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 942d6a7c9ae1..17b7934f31cb 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include "power.h" diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 696100241a6f..951a4e33b92b 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 320668f4c3aa..3302586655c4 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86 /* diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 25d139c9dbed..5c0d96a820fa 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 0debc17c8e28..3ee1fdb31ea7 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -29,7 +29,8 @@ #include #include #include -#include +#include + #include #include diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index aab970760b75..86ad2eee1201 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -14,7 +14,7 @@ */ #include #include -#include +#include #include int edac_op_state = EDAC_OPSTATE_INVAL; diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 29d2423fae6d..85661b060ed7 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include "core.h" diff --git 
a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 95a471401892..8ba7f7928f1f 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 193ed9233144..94d3b494ddfb 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 0fe4e4e6eda7..b45be5767529 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -9,7 +9,7 @@ #include #include -#include +#include struct device; struct fw_card; diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c index 0618145376ad..763626b739d1 100644 --- a/drivers/firewire/nosy.c +++ b/drivers/firewire/nosy.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include "nosy.h" diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index ef37a9b5a3cc..32807baf55e2 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -60,7 +60,7 @@ * are considered as fatal) */ -#include +#include #include #include #include diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 021d2b6b556f..7fd4e3e5ad5f 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -29,7 +29,7 @@ * Dave Airlie */ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 2e618b5ac465..56619f64b6bf 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #define TTM_ASSERT_LOCKED(param) #define TTM_DEBUG(fmt, arg...) 
diff --git a/drivers/gpu/drm/ttm/ttm_lock.c b/drivers/gpu/drm/ttm/ttm_lock.c index de41e55a944a..075daf44bce4 100644 --- a/drivers/gpu/drm/ttm/ttm_lock.c +++ b/drivers/gpu/drm/ttm/ttm_lock.c @@ -30,7 +30,7 @@ #include "ttm/ttm_lock.h" #include "ttm/ttm_module.h" -#include +#include #include #include #include diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c index ebddd443d91a..93577f2e2954 100644 --- a/drivers/gpu/drm/ttm/ttm_object.c +++ b/drivers/gpu/drm/ttm/ttm_object.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include struct ttm_object_file { struct ttm_object_device *tdev; diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c index 170e751c283e..727e93daac3b 100644 --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include "ttm/ttm_bo_driver.h" #include "ttm/ttm_page_alloc.h" diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index 7d231cf5d2ce..fe4104c6b764 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include /* Commands */ #define SHT15_MEASURE_TEMP 0x03 diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 0347eed4a167..40c835309e49 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -31,7 +31,7 @@ */ #include -#include +#include #include "iw_cxgb4.h" diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h index f09914cccf53..54c0d23bad92 100644 --- a/drivers/infiniband/hw/ehca/ehca_tools.h +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -58,7 +58,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 73bc18465c9c..c118663e4437 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -34,7 +34,7 @@ #define TCPOPT_TIMESTAMP 8 -#include +#include #include #include #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 7b6985a2e652..b3cc1e062b17 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -45,7 +45,7 @@ #include -#include +#include #include #include diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 7d5109bbd1ad..0bfa545675b8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/isdn/gigaset/gigaset.h b/drivers/isdn/gigaset/gigaset.h index 6dd360734cfd..212efaf9a4e4 100644 --- a/drivers/isdn/gigaset/gigaset.h +++ b/drivers/isdn/gigaset/gigaset.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include #define GIG_VERSION {0, 5, 0, 0} #define GIG_COMPAT {0, 4, 0, 0} diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c8827ffd85bb..bae6c4e23d3f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 819e37eaaeba..320401dec104 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 
aa4e570c2cb5..c3547016f0f1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index f92b6cea9d9c..03a837aa5ce6 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #define DM_MSG_PREFIX "multipath queue-length" #define QL_MIN_IO 128 diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 451c3bb176d2..bfe9c2333cea 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #define DM_MSG_PREFIX "table" diff --git a/drivers/media/video/hdpvr/hdpvr-core.c b/drivers/media/video/hdpvr/hdpvr-core.c index a27d93b503a5..5f1db46beb4e 100644 --- a/drivers/media/video/hdpvr/hdpvr-core.c +++ b/drivers/media/video/hdpvr/hdpvr-core.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/video/tlg2300/pd-dvb.c b/drivers/media/video/tlg2300/pd-dvb.c index edd78f8b1baa..d0da11ae19df 100644 --- a/drivers/media/video/tlg2300/pd-dvb.c +++ b/drivers/media/video/tlg2300/pd-dvb.c @@ -7,7 +7,7 @@ #include "vendorcmds.h" #include -#include +#include static void dvb_urb_cleanup(struct pd_dvb_adapter *pd_dvb); diff --git a/drivers/media/video/uvc/uvc_ctrl.c b/drivers/media/video/uvc/uvc_ctrl.c index a4db26fa2f53..2c8954ec6859 100644 --- a/drivers/media/video/uvc/uvc_ctrl.c +++ b/drivers/media/video/uvc/uvc_ctrl.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "uvcvideo.h" diff --git a/drivers/media/video/uvc/uvc_queue.c b/drivers/media/video/uvc/uvc_queue.c index f90ce9fce539..677691c44500 100644 --- a/drivers/media/video/uvc/uvc_queue.c +++ b/drivers/media/video/uvc/uvc_queue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "uvcvideo.h" diff --git a/drivers/media/video/uvc/uvc_v4l2.c b/drivers/media/video/uvc/uvc_v4l2.c index 543a80395b7f..dde6533e8e6d 100644 --- a/drivers/media/video/uvc/uvc_v4l2.c +++ b/drivers/media/video/uvc/uvc_v4l2.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/media/video/uvc/uvc_video.c b/drivers/media/video/uvc/uvc_video.c index 49994793cc77..8244167c8915 100644 --- a/drivers/media/video/uvc/uvc_video.c +++ b/drivers/media/video/uvc/uvc_video.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c index 74fbe56321ff..c8ed7b63fdf5 100644 --- a/drivers/message/i2o/i2o_scsi.c +++ b/drivers/message/i2o/i2o_scsi.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index b05db55c8c8e..21b28fc6d912 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #define PHANTOM_VERSION "n0.9.8" diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c index 6f0e9403004b..97e6954304ea 100644 --- a/drivers/net/atlx/atl1.c +++ b/drivers/net/atlx/atl1.c @@ -44,7 +44,7 @@ * SMP torture testing */ -#include +#include #include #include diff --git a/drivers/net/atlx/atl2.c b/drivers/net/atlx/atl2.c index e0f87cf1e2ba..d4f7dda39721 100644 --- a/drivers/net/atlx/atl2.c +++ 
b/drivers/net/atlx/atl2.c @@ -20,7 +20,7 @@ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include +#include #include #include #include diff --git a/drivers/net/atlx/atl2.h b/drivers/net/atlx/atl2.h index 78344ddf4bf0..bf9016ebdd9b 100644 --- a/drivers/net/atlx/atl2.h +++ b/drivers/net/atlx/atl2.h @@ -25,7 +25,7 @@ #ifndef _ATL2_H_ #define _ATL2_H_ -#include +#include #include #ifndef _ATL2_HW_H_ diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index b414f5ae0da5..646c86bcc545 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -98,7 +98,7 @@ #include -#include +#include #include #include #include diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c index 086ce0418b29..e0638cb4b07c 100644 --- a/drivers/net/cpmac.c +++ b/drivers/net/cpmac.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include MODULE_AUTHOR("Eugene Konev "); MODULE_DESCRIPTION("TI AR7 ethernet driver (CPMAC)"); diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c index 32636a1d62a5..805076c54f1b 100644 --- a/drivers/net/cxgb3/cxgb3_offload.c +++ b/drivers/net/cxgb3/cxgb3_offload.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/cxgb3/l2t.h b/drivers/net/cxgb3/l2t.h index fd3eb07e3f40..7a12d52ed4fc 100644 --- a/drivers/net/cxgb3/l2t.h +++ b/drivers/net/cxgb3/l2t.h @@ -34,7 +34,7 @@ #include #include "t3cdev.h" -#include +#include enum { L2T_STATE_VALID, /* entry is up to date */ diff --git a/drivers/net/cxgb3/t3cdev.h b/drivers/net/cxgb3/t3cdev.h index be55e9ae74d1..705713b56636 100644 --- a/drivers/net/cxgb3/t3cdev.h +++ b/drivers/net/cxgb3/t3cdev.h @@ -33,7 +33,7 @@ #define _T3CDEV_H_ #include -#include +#include #include #include #include diff --git a/drivers/net/cxgb4/cxgb4_uld.h b/drivers/net/cxgb4/cxgb4_uld.h index 1b48c0170145..b1d39b8d141a 100644 --- a/drivers/net/cxgb4/cxgb4_uld.h +++ b/drivers/net/cxgb4/cxgb4_uld.h @@ -38,7 +38,7 @@ #include #include #include -#include +#include /* CPL message priority levels */ enum { diff --git a/drivers/net/cxgb4/l2t.h b/drivers/net/cxgb4/l2t.h index 7bd8f42378ff..02b31d0c6410 100644 --- a/drivers/net/cxgb4/l2t.h +++ b/drivers/net/cxgb4/l2t.h @@ -37,7 +37,7 @@ #include #include -#include +#include struct adapter; struct l2t_data; diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 0d283781bc5e..2a5a34d2d67b 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #define SIXPACK_VERSION "Revision: 0.3.0" diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c index 52b14256e2c0..ce555d9ac02c 100644 --- a/drivers/net/hamradio/dmascc.c +++ b/drivers/net/hamradio/dmascc.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index 838c5b673767..ba99af05bf62 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a47595760751..3cbda0851f83 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 4609bc0e2f56..10e5d985afa3 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c 
@@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 5eacc653a94d..c421a6141854 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -155,7 +155,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/b43legacy/b43legacy.h b/drivers/net/wireless/b43legacy/b43legacy.h index 17a130d18dc9..a610a352102a 100644 --- a/drivers/net/wireless/b43legacy/b43legacy.h +++ b/drivers/net/wireless/b43legacy/b43legacy.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/wireless/b43legacy/dma.h b/drivers/net/wireless/b43legacy/dma.h index f89c34226288..686941c242fc 100644 --- a/drivers/net/wireless/b43legacy/dma.h +++ b/drivers/net/wireless/b43legacy/dma.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "b43legacy.h" diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h index 0b54e46c3c14..38b6fc028984 100644 --- a/drivers/oprofile/oprofile_stats.h +++ b/drivers/oprofile/oprofile_stats.h @@ -10,7 +10,7 @@ #ifndef OPROFILE_STATS_H #define OPROFILE_STATS_H -#include +#include struct oprofile_stat_struct { atomic_t sample_lost_no_mm; diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index d703e73fffa7..3fadf2f135e8 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include "cpci_hotplug.h" diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 492b7d807fe8..6fa215a38615 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 77f778b7b070..16c5208c3dc7 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include "dasd_int.h" diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c index 05909a7df8b3..a90a02c28d6a 100644 --- a/drivers/s390/char/sclp_quiesce.c +++ b/drivers/s390/char/sclp_quiesce.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index c837d7419a6a..524d988d89dd 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h index 7e297c7bb5ff..0b7245c72d5e 100644 --- a/drivers/s390/cio/device.h +++ b/drivers/s390/cio/device.h @@ -2,7 +2,7 @@ #define S390_DEVICE_H #include -#include +#include #include #include #include "io_sch.h" diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 570d4da10696..e58169c32474 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 68be6e157126..2a1d4dfaf859 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -9,7 +9,7 @@ #include 
#include #include -#include +#include #include #include #include diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index f8134a44cefa..b77ae519d79c 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 8e65447f76b7..88ad33ed5d38 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c index 2176d00b395e..da171b5f3996 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.c +++ b/drivers/s390/crypto/zcrypt_cex2a.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_mono.c b/drivers/s390/crypto/zcrypt_mono.c index 44253fdd4136..eb313c3fb2d1 100644 --- a/drivers/s390/crypto/zcrypt_mono.c +++ b/drivers/s390/crypto/zcrypt_mono.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcica.c b/drivers/s390/crypto/zcrypt_pcica.c index 1afb69c75fea..d84816f144df 100644 --- a/drivers/s390/crypto/zcrypt_pcica.c +++ b/drivers/s390/crypto/zcrypt_pcica.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcicc.c b/drivers/s390/crypto/zcrypt_pcicc.c index aa4c050a5694..bdbdbe192993 100644 --- a/drivers/s390/crypto/zcrypt_pcicc.c +++ b/drivers/s390/crypto/zcrypt_pcicc.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcixcc.c b/drivers/s390/crypto/zcrypt_pcixcc.c index 4f85eb725f4f..dd4737808e06 100644 --- a/drivers/s390/crypto/zcrypt_pcixcc.c +++ b/drivers/s390/crypto/zcrypt_pcixcc.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/net/fsm.h b/drivers/s390/net/fsm.h index 1e8b235d95b5..a4510cf59034 100644 --- a/drivers/s390/net/fsm.h +++ b/drivers/s390/net/fsm.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include /** * Define this to get debugging messages. 
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 2a4991d6d4d5..7cac873c7383 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include "zfcp_ext.h" #include "zfcp_dbf.h" #include "zfcp_fc.h" diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c index 740da4465447..965a1fccd66a 100644 --- a/drivers/sbus/char/display7seg.c +++ b/drivers/sbus/char/display7seg.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include /* put_/get_user */ #include diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index 179ad77f6cc9..bd9e31e16249 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 6bba23a26303..c6f99b1d2383 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include "hpsa_cmd.h" #include "hpsa.h" diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index aa05e661d113..b97c8ab0c20e 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include "pm8001_defs.h" #define DRV_NAME "pm8001" diff --git a/drivers/staging/octeon/ethernet-rx.c b/drivers/staging/octeon/ethernet-rx.c index 0f22f0f47446..1a7c19ae766f 100644 --- a/drivers/staging/octeon/ethernet-rx.c +++ b/drivers/staging/octeon/ethernet-rx.c @@ -42,7 +42,7 @@ #include #endif /* CONFIG_XFRM */ -#include +#include #include diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index 6227571149f5..b445cd63f901 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -38,7 +38,7 @@ #include #endif /* CONFIG_XFRM */ -#include +#include #include diff --git a/drivers/staging/solo6x10/solo6x10.h b/drivers/staging/solo6x10/solo6x10.h index fd59b093dd4d..17c06bd6cc91 100644 --- a/drivers/staging/solo6x10/solo6x10.h +++ b/drivers/staging/solo6x10/solo6x10.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/tidspbridge/include/dspbridge/host_os.h b/drivers/staging/tidspbridge/include/dspbridge/host_os.h index 1a38896f4331..a2f31c69d12e 100644 --- a/drivers/staging/tidspbridge/include/dspbridge/host_os.h +++ b/drivers/staging/tidspbridge/include/dspbridge/host_os.h @@ -18,7 +18,7 @@ #define _HOST_OS_H_ #include -#include +#include #include #include #include diff --git a/drivers/staging/winbond/mds_s.h b/drivers/staging/winbond/mds_s.h index eeedf0186365..07d835b3b706 100644 --- a/drivers/staging/winbond/mds_s.h +++ b/drivers/staging/winbond/mds_s.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include "localpara.h" #include "mac_structures.h" diff --git a/drivers/staging/winbond/wb35reg_s.h b/drivers/staging/winbond/wb35reg_s.h index eb274ffdd1ba..dc79faa4029f 100644 --- a/drivers/staging/winbond/wb35reg_s.h +++ b/drivers/staging/winbond/wb35reg_s.h @@ -3,7 +3,7 @@ #include #include -#include +#include struct hw_data; diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c index 03c285bb2f18..3a997760ec32 100644 --- a/drivers/tty/bfin_jtag_comm.c +++ b/drivers/tty/bfin_jtag_comm.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #define 
pr_init(fmt, args...) ({ static const __initconst char __fmt[] = fmt; printk(__fmt, ## args); }) diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 13043e8d37fe..6a1241c7f841 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -83,7 +83,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c index 57421d776329..ddc487a2d42f 100644 --- a/drivers/tty/serial/dz.c +++ b/drivers/tty/serial/dz.c @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c index ea2340b814e9..6bc2e3f876f4 100644 --- a/drivers/tty/serial/sb1250-duart.c +++ b/drivers/tty/serial/sb1250-duart.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c index 1a7fd3e70315..0aebd7121b56 100644 --- a/drivers/tty/serial/zs.c +++ b/drivers/tty/serial/zs.c @@ -65,7 +65,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c index 02a02700b51d..a9a4eade7e80 100644 --- a/drivers/usb/gadget/f_audio.c +++ b/drivers/usb/gadget/f_audio.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "u_audio.h" diff --git a/drivers/usb/gadget/f_rndis.c b/drivers/usb/gadget/f_rndis.c index 8f3eae90919f..3ea4666be3d0 100644 --- a/drivers/usb/gadget/f_rndis.c +++ b/drivers/usb/gadget/f_rndis.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include "u_ether.h" #include "rndis.h" diff --git a/drivers/usb/gadget/uvc_queue.c b/drivers/usb/gadget/uvc_queue.c index f7395ac5dc17..aa0ad34e0f1f 100644 --- a/drivers/usb/gadget/uvc_queue.c +++ b/drivers/usb/gadget/uvc_queue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "uvc.h" diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c index a0037961e5bd..27e209a7222f 100644 --- a/drivers/usb/image/microtek.c +++ b/drivers/usb/image/microtek.c @@ -131,7 +131,7 @@ #include #include -#include +#include #include #include "../../scsi/scsi.h" #include diff --git a/drivers/usb/misc/appledisplay.c b/drivers/usb/misc/appledisplay.c index 68ab460a735c..ac0d75a9005a 100644 --- a/drivers/usb/misc/appledisplay.c +++ b/drivers/usb/misc/appledisplay.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #define APPLE_VENDOR_ID 0x05AC diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index b0a7a9e909a4..1a49ca9c8ea5 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/wusbcore/wa-rpipe.c b/drivers/usb/wusbcore/wa-rpipe.c index ca80171f42c6..2acc7f504c51 100644 --- a/drivers/usb/wusbcore/wa-rpipe.c +++ b/drivers/usb/wusbcore/wa-rpipe.c @@ -58,7 +58,7 @@ * destination address. 
*/ #include -#include +#include #include #include diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 14c9abf0d800..a801e2821d03 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* This is for zerocopy, used buffer len is set to 1 when lower device DMA * done */ diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c index 019dbd3f12b2..b048417247e8 100644 --- a/drivers/video/sh_mobile_lcdcfb.c +++ b/drivers/video/sh_mobile_lcdcfb.c @@ -24,7 +24,7 @@ #include #include #include