Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- | fs/fs-writeback.c | 744
1 file changed, 295 insertions(+), 449 deletions(-)
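At its core, this change drops the old RCU-managed struct bdi_work items, with their per-thread `seen' bitmap and on-stack WS_* state bits, in favour of a plain wb_writeback_work list protected by wb_lock; callers that need to wait simply block on a completion. As orientation before the diff itself, here is a condensed sketch of the queue-and-wait pattern the patch introduces. The helper name queue_and_wait is invented for illustration; the fields and calls mirror bdi_queue_work() and sync_inodes_sb() in the diff below.

	/*
	 * Sketch of the new synchronous pattern: queue a work item on the
	 * bdi's list, kick the flusher thread, then sleep until the flusher
	 * calls complete() on our on-stack completion.
	 */
	static void queue_and_wait(struct backing_dev_info *bdi,
				   struct wb_writeback_work *work)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		work->done = &done;	/* flusher signals this when finished */

		spin_lock_bh(&bdi->wb_lock);
		list_add_tail(&work->list, &bdi->work_list);
		if (bdi->wb.task)
			wake_up_process(bdi->wb.task);
		spin_unlock_bh(&bdi->wb_lock);

		wait_for_completion(&done);
	}

Because the waiter owns both the work item and the completion on its own stack, no reference counting or RCU deferral is needed; asynchronous callers instead kzalloc() the item, leave work->done NULL, and the flusher frees it.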
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..7d9d06ba184b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,63 +26,38 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/tracepoint.h>
 #include "internal.h"
 
-#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
-
-/*
- * We don't actually have pdflush, but this one is exported though /proc...
- */
-int nr_pdflush_threads;
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
-struct wb_writeback_args {
+struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
-	unsigned int sb_pinned:1;
-};
 
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
 	struct list_head list;		/* pending work list */
-	struct rcu_head rcu_head;	/* for RCU free/clear of work */
-
-	unsigned long seen;		/* threads that have seen this work */
-	atomic_t pending;		/* number of threads still to do work */
-
-	struct wb_writeback_args args;	/* writeback arguments */
-
-	unsigned long state;		/* flag bits, see WS_* */
-};
-
-enum {
-	WS_USED_B = 0,
-	WS_ONSTACK_B,
+	struct completion *done;	/* set if the caller waits */
 };
 
-#define WS_USED (1 << WS_USED_B)
-#define WS_ONSTACK (1 << WS_ONSTACK_B)
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure so that the definition remains local to this
+ * file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
 
-static inline bool bdi_work_on_stack(struct bdi_work *work)
-{
-	return test_bit(WS_ONSTACK_B, &work->state);
-}
+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
-static inline void bdi_work_init(struct bdi_work *work,
-				 struct wb_writeback_args *args)
-{
-	INIT_RCU_HEAD(&work->rcu_head);
-	work->args = *args;
-	work->state = WS_USED;
-}
+/*
+ * We don't actually have pdflush, but this one is exported through /proc...
+ */
+int nr_pdflush_threads;
 
 /**
  * writeback_in_progress - determine whether there is writeback in progress
@@ -93,196 +68,84 @@ static inline void bdi_work_init(struct bdi_work *work,
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return !list_empty(&bdi->work_list);
-}
-
-static void bdi_work_clear(struct bdi_work *work)
-{
-	clear_bit(WS_USED_B, &work->state);
-	smp_mb__after_clear_bit();
-	/*
-	 * work can have disappeared at this point. bit waitq functions
-	 * should be able to tolerate this, provided bdi_sched_wait does
-	 * not dereference it's pointer argument.
-	 */
-	wake_up_bit(&work->state, WS_USED_B);
-}
-
-static void bdi_work_free(struct rcu_head *head)
-{
-	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
-
-	if (!bdi_work_on_stack(work))
-		kfree(work);
-	else
-		bdi_work_clear(work);
-}
-
-static void wb_work_complete(struct bdi_work *work)
-{
-	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
-	int onstack = bdi_work_on_stack(work);
-
-	/*
-	 * For allocated work, we can clear the done/seen bit right here.
-	 * For on-stack work, we need to postpone both the clear and free
-	 * to after the RCU grace period, since the stack could be invalidated
-	 * as soon as bdi_work_clear() has done the wakeup.
-	 */
-	if (!onstack)
-		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || onstack)
-		call_rcu(&work->rcu_head, bdi_work_free);
+	return test_bit(BDI_writeback_running, &bdi->state);
 }
 
-static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+static void bdi_queue_work(struct backing_dev_info *bdi,
+			   struct wb_writeback_work *work)
 {
-	/*
-	 * The caller has retrieved the work arguments from this work,
-	 * drop our reference. If this is the last ref, delete and free it
-	 */
-	if (atomic_dec_and_test(&work->pending)) {
-		struct backing_dev_info *bdi = wb->bdi;
+	trace_writeback_queue(bdi, work);
 
-		spin_lock(&bdi->wb_lock);
-		list_del_rcu(&work->list);
-		spin_unlock(&bdi->wb_lock);
-
-		wb_work_complete(work);
-	}
-}
-
-static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
-{
-	work->seen = bdi->wb_mask;
-	BUG_ON(!work->seen);
-	atomic_set(&work->pending, bdi->wb_cnt);
-	BUG_ON(!bdi->wb_cnt);
-
-	/*
-	 * list_add_tail_rcu() contains the necessary barriers to
-	 * make sure the above stores are seen before the item is
-	 * noticed on the list
-	 */
-	spin_lock(&bdi->wb_lock);
-	list_add_tail_rcu(&work->list, &bdi->work_list);
-	spin_unlock(&bdi->wb_lock);
-
-	/*
-	 * If the default thread isn't there, make sure we add it. When
-	 * it gets created and wakes up, we'll run this work.
-	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (bdi->wb.task) {
+		wake_up_process(bdi->wb.task);
+	} else {
+		/*
+		 * The bdi thread isn't there, wake up the forker thread which
+		 * will create and run it.
+		 */
+		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		if (wb->task)
-			wake_up_process(wb->task);
 	}
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
-/*
- * Used for on-stack allocated work items. The caller needs to wait until
- * the wb threads have acked the work before it's safe to continue.
- */
-static void bdi_wait_on_work_clear(struct bdi_work *work)
-{
-	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
-		    TASK_UNINTERRUPTIBLE);
-}
-
-static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args,
-				 int wait)
+static void
+__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+		      bool range_cyclic, bool for_background)
 {
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 
 	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		bdi_work_init(work, args);
-		bdi_queue_work(bdi, work);
-		if (wait)
-			bdi_wait_on_work_clear(work);
-	} else {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		if (wb->task)
-			wake_up_process(wb->task);
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		if (bdi->wb.task) {
+			trace_writeback_nowork(bdi);
+			wake_up_process(bdi->wb.task);
+		}
+		return;
 	}
+
+	work->sync_mode = WB_SYNC_NONE;
+	work->nr_pages = nr_pages;
+	work->range_cyclic = range_cyclic;
+	work->for_background = for_background;
+
+	bdi_queue_work(bdi, work);
 }
 
 /**
- * bdi_sync_writeback - start and wait for writeback
+ * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
- * @sb: write inodes from this super_block
+ * @nr_pages: the number of pages to write
  *
  * Description:
- *   This does WB_SYNC_ALL data integrity writeback and waits for the
- *   IO to complete. Callers must hold the sb s_umount semaphore for
- *   reading, to avoid having the super disappear before we are done.
+ *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   started when this function returns, we make no guarantees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ *
  */
-static void bdi_sync_writeback(struct backing_dev_info *bdi,
-			       struct super_block *sb)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_ALL,
-		.nr_pages	= LONG_MAX,
-		.range_cyclic	= 0,
-		/*
-		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
-		 * lets make it explicitly clear.
-		 */
-		.sb_pinned	= 1,
-	};
-	struct bdi_work work;
-
-	bdi_work_init(&work, &args);
-	work.state |= WS_ONSTACK;
-
-	bdi_queue_work(bdi, &work);
-	bdi_wait_on_work_clear(&work);
+	__bdi_start_writeback(bdi, nr_pages, true, false);
 }
 
 /**
- * bdi_start_writeback - start writeback
+ * bdi_start_background_writeback - start background writeback
  * @bdi: the backing device to write from
- * @sb: write inodes from this super_block
- * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
- *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   This does WB_SYNC_NONE background writeback. The IO is only
  *   started when this function returns, we make no guarantees on
- *   completion. Caller specifies whether sb umount sem is held already or not.
- *
+ *   completion. Caller need not hold sb s_umount semaphore.
  */
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages, int sb_locked)
+void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_NONE,
-		.nr_pages	= nr_pages,
-		.range_cyclic	= 1,
-		.sb_pinned	= sb_locked,
-	};
-
-	/*
-	 * We treat @nr_pages=0 as the special case to do background writeback,
-	 * ie. to sync pages until the background dirty threshold is reached.
-	 */
-	if (!nr_pages) {
-		args.nr_pages = LONG_MAX;
-		args.for_background = 1;
-	}
-
-	bdi_alloc_queue_work(bdi, &args, sb_locked);
+	__bdi_start_writeback(bdi, LONG_MAX, true, true);
 }
 
 /*
@@ -386,10 +249,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
+ * Before
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    gf         edc     BA
+ * After
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    g          fBAedc
+ *                                           |
+ *                                           +--> dequeue for IO
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
@@ -499,63 +370,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
-			/*
-			 * More pages get dirtied by a fast dirtier.
-			 */
-			goto select_queue;
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * At least XFS will redirty the inode during the
-			 * writeback (delalloc) and on io completion (isize).
-			 */
-			redirty_tail(inode);
-		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+	if (!(inode->i_state & I_FREEING)) {
+		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
			 * We didn't write back all the pages.  nfs_writepages()
-			 * sometimes bales out without doing anything. Redirty
-			 * the inode; Move it from b_io onto b_more_io/b_dirty.
-			 */
-			/*
-			 * akpm: if the caller was the kupdate function we put
-			 * this inode at the head of b_dirty so it gets first
-			 * consideration. Otherwise, move it to the tail, for
-			 * the reasons described there. I'm not really sure
-			 * how much sense this makes. Presumably I had a good
-			 * reasons for doing it this way, and I'd rather not
-			 * muck with it at present.
+			 * sometimes bales out without doing anything.
 			 */
-			if (wbc->for_kupdate) {
+			inode->i_state |= I_DIRTY_PAGES;
+			if (wbc->nr_to_write <= 0) {
 				/*
-				 * For the kupdate function we move the inode
-				 * to b_more_io so it will get more writeout as
-				 * soon as the queue becomes uncongested.
+				 * slice used up: queue for next turn
 				 */
-				inode->i_state |= I_DIRTY_PAGES;
-select_queue:
-				if (wbc->nr_to_write <= 0) {
-					/*
-					 * slice used up: queue for next turn
-					 */
-					requeue_io(inode);
-				} else {
-					/*
-					 * somehow blocked: retry later
-					 */
-					redirty_tail(inode);
-				}
+				requeue_io(inode);
 			} else {
 				/*
-				 * Otherwise fully redirty the inode so that
-				 * other inodes on this superblock will get some
-				 * writeout. Otherwise heavy writing to one
-				 * file would indefinitely suspend writeout of
-				 * all the other files.
+				 * Writeback blocked by something other than
+				 * congestion. Delay the inode for some time to
+				 * avoid spinning on the CPU (100% iowait)
+				 * retrying writeback of the dirty page/inode
+				 * that cannot be performed immediately.
 				 */
-				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
+		} else if (inode->i_state & I_DIRTY) {
+			/*
+			 * Filesystems can dirty the inode during writeback
+			 * operations, such as delayed allocation during
+			 * submission or metadata updates after data IO
+			 * completion.
+			 */
+			redirty_tail(inode);
 		} else if (atomic_read(&inode->i_count)) {
 			/*
			 * The inode is clean, inuse
@@ -572,75 +416,69 @@ select_queue:
 	return ret;
 }
 
-static void unpin_sb_for_writeback(struct super_block *sb)
-{
-	up_read(&sb->s_umount);
-	put_super(sb);
-}
-
-enum sb_pin_state {
-	SB_PINNED,
-	SB_NOT_PINNED,
-	SB_PIN_FAILED
-};
-
 /*
- * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * For background writeback the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 */
-static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
-					      struct super_block *sb)
+static bool pin_sb_for_writeback(struct super_block *sb)
 {
-	/*
-	 * Caller must already hold the ref for this
-	 */
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
-		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return SB_NOT_PINNED;
-	}
 	spin_lock(&sb_lock);
+	if (list_empty(&sb->s_instances)) {
+		spin_unlock(&sb_lock);
+		return false;
+	}
+
 	sb->s_count++;
+	spin_unlock(&sb_lock);
+
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_unlock(&sb_lock);
-			return SB_PINNED;
-		}
-		/*
-		 * umounted, drop rwsem again and fall through to failure
-		 */
+		if (sb->s_root)
+			return true;
 		up_read(&sb->s_umount);
 	}
-	sb->s_count--;
-	spin_unlock(&sb_lock);
-	return SB_PIN_FAILED;
+
+	put_super(sb);
+	return false;
 }
 
 /*
 * Write a portion of b_io inodes which belong to @sb.
- * If @wbc->sb != NULL, then find and write all such
+ *
+ * If @only_this_sb is true, then find and write all such
 * inodes. Otherwise write only ones which go sequentially
 * in reverse order.
+ *
 * Return 1, if the caller writeback routine should be
 * interrupted. Otherwise return 0.
 */
-static int writeback_sb_inodes(struct super_block *sb,
-			       struct bdi_writeback *wb,
-			       struct writeback_control *wbc)
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
+			       struct writeback_control *wbc, bool only_this_sb)
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
-		if (wbc->sb && sb != inode->i_sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		if (sb != inode->i_sb)
-			/* finish with this superblock */
+
+		if (inode->i_sb != sb) {
+			if (only_this_sb) {
+				/*
+				 * We only want to write back data for this
+				 * superblock, move all inodes not belonging
+				 * to it back onto the dirty list.
+				 */
+				redirty_tail(inode);
+				continue;
+			}
+
+			/*
+			 * The inode belongs to a different superblock.
+			 * Bounce back to the caller to unpin this and
+			 * pin the next superblock.
+			 */
 			return 0;
+		}
+
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
@@ -652,7 +490,7 @@ static int writeback_sb_inodes(struct super_block *sb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
+		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -678,12 +516,13 @@ static int writeback_sb_inodes(struct super_block *sb,
 	return 1;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+			 struct writeback_control *wbc)
 {
 	int ret = 0;
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
+	if (!wbc->wb_start)
+		wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
@@ -692,24 +531,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
 		struct super_block *sb = inode->i_sb;
-		enum sb_pin_state state;
 
-		if (wbc->sb && sb != wbc->sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		state = pin_sb_for_writeback(wbc, sb);
 
-		if (state == SB_PIN_FAILED) {
+		if (!pin_sb_for_writeback(sb)) {
 			requeue_io(inode);
 			continue;
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc);
+		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		drop_super(sb);
 
-		if (state == SB_PINNED)
-			unpin_sb_for_writeback(sb);
 		if (ret)
 			break;
 	}
@@ -717,11 +546,16 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 	/* Leave any unwritten inodes on b_io */
 }
 
-void writeback_inodes_wbc(struct writeback_control *wbc)
+static void __writeback_inodes_sb(struct super_block *sb,
+		struct bdi_writeback *wb, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = wbc->bdi;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	writeback_inodes_wb(&bdi->wb, wbc);
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+	writeback_sb_inodes(sb, wb, wbc, true);
+	spin_unlock(&inode_lock);
 }
 
 /*
@@ -737,7 +571,7 @@ static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
 }
 
@@ -759,17 +593,14 @@ static inline bool over_bground_thresh(void)
 * all dirty pages if they are all attached to "old" mappings.
 */
 static long wb_writeback(struct bdi_writeback *wb,
-			 struct wb_writeback_args *args)
+			 struct wb_writeback_work *work)
 {
 	struct writeback_control wbc = {
-		.bdi			= wb->bdi,
-		.sb			= args->sb,
-		.sync_mode		= args->sync_mode,
+		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= args->for_kupdate,
-		.for_background		= args->for_background,
-		.range_cyclic		= args->range_cyclic,
-		.sb_pinned		= args->sb_pinned,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -785,25 +616,33 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
		 * Stop writeback when nr_pages has been consumed
		 */
-		if (args->nr_pages <= 0)
+		if (work->nr_pages <= 0)
 			break;
 
 		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
-		if (args->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		writeback_inodes_wb(wb, &wbc);
-		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		trace_wbc_writeback_start(&wbc, wb->bdi);
+		if (work->sb)
+			__writeback_inodes_sb(work->sb, wb, &wbc);
+		else
+			writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc, wb->bdi);
+
+		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -830,6 +669,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
@@ -839,31 +679,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 }
 
 /*
- * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
-					   struct bdi_writeback *wb)
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi)
 {
-	struct bdi_work *work, *ret = NULL;
-
-	rcu_read_lock();
+	struct wb_writeback_work *work = NULL;
 
-	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_bit(wb->nr, &work->seen))
-			continue;
-		clear_bit(wb->nr, &work->seen);
-
-		ret = work;
-		break;
+	spin_lock_bh(&bdi->wb_lock);
+	if (!list_empty(&bdi->work_list)) {
+		work = list_entry(bdi->work_list.next,
+				  struct wb_writeback_work, list);
+		list_del_init(&work->list);
 	}
-
-	rcu_read_unlock();
-	return ret;
+	spin_unlock_bh(&bdi->wb_lock);
	return work;
 }
 
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -888,14 +718,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	if (nr_pages) {
-		struct wb_writeback_args args = {
+		struct wb_writeback_work work = {
 			.nr_pages	= nr_pages,
 			.sync_mode	= WB_SYNC_NONE,
 			.for_kupdate	= 1,
 			.range_cyclic	= 1,
 		};
 
-		return wb_writeback(wb, &args);
+		return wb_writeback(wb, &work);
 	}
 
 	return 0;
@@ -907,42 +737,37 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		struct wb_writeback_args args = work->args;
-		int post_clear;
-
+	set_bit(BDI_writeback_running, &wb->bdi->state);
+	while ((work = get_next_work_item(bdi)) != NULL) {
 		/*
		 * Override sync mode, in case we must wait for completion
+		 * because this thread is exiting now.
		 */
 		if (force_wait)
-			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+			work->sync_mode = WB_SYNC_ALL;
 
-		post_clear = WB_SYNC_ALL || args.sb_pinned;
-
-		/*
-		 * If this isn't a data integrity operation, just notify
-		 * that we have seen this work and we are now starting it.
-		 */
-		if (!post_clear)
-			wb_clear_pending(wb, work);
+		trace_writeback_exec(bdi, work);
 
-		wrote += wb_writeback(wb, &args);
+		wrote += wb_writeback(wb, work);
 
 		/*
-		 * This is a data integrity writeback, so only do the
-		 * notification when we have completed the work.
+		 * Notify the caller of completion if this is a synchronous
+		 * work item, otherwise just free it.
		 */
-		if (post_clear)
-			wb_clear_pending(wb, work);
+		if (work->done)
+			complete(work->done);
+		else
+			kfree(work);
 	}
 
 	/*
	 * Check for periodic writeback, kupdated() style
	 */
 	wrote += wb_check_old_data_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
 }
@@ -951,84 +776,88 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
-	unsigned long last_active = jiffies;
-	unsigned long wait_jiffies = -1UL;
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
 	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the periodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
-			last_active = jiffies;
-		else if (wait_jiffies != -1UL) {
-			unsigned long max_idle;
+			wb->last_active = jiffies;
 
-			/*
-			 * Longest period of inactivity that we tolerate. If we
-			 * see dirty data again later, the task will get
-			 * recreated automatically.
-			 */
-			max_idle = max(5UL * 60 * HZ, wait_jiffies);
-			if (time_after(jiffies, max_idle + last_active))
-				break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list)) {
+			__set_current_state(TASK_RUNNING);
+			continue;
 		}
 
-		if (dirty_writeback_interval) {
-			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-			schedule_timeout_interruptible(wait_jiffies);
-		} else {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (list_empty_careful(&wb->bdi->work_list) &&
-			    !kthread_should_stop())
-				schedule();
-			__set_current_state(TASK_RUNNING);
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
+			/*
			 * We have nothing to do, so can go sleep without any
			 * timeout and save power. When a work is queued or
			 * something is made dirty - we will be woken up.
			 */
+			schedule();
 		}
 
 		try_to_freeze();
 	}
 
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
 	return 0;
 }
 
+
 /*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
 */
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.nr_pages	= nr_pages,
-		.sync_mode	= WB_SYNC_NONE,
-	};
 	struct backing_dev_info *bdi;
 
-	rcu_read_lock();
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-
-		bdi_alloc_queue_work(bdi, &args, 0);
+		__bdi_start_writeback(bdi, nr_pages, false, false);
 	}
-
 	rcu_read_unlock();
 }
 
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	bdi_writeback_all(NULL, nr_pages);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1081,6 +910,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = NULL;
+	bool wakeup_bdi = false;
 
 	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1126,7 +957,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			if (hlist_unhashed(&inode->i_hash))
 				goto out;
 		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
+		if (inode->i_state & I_FREEING)
 			goto out;
 
 		/*
@@ -1134,22 +965,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
		 * reposition it (that would break b_dirty time-ordering).
		 */
 		if (!was_dirty) {
-			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-			struct backing_dev_info *bdi = wb->bdi;
-
-			if (bdi_cap_writeback_dirty(bdi) &&
-			    !test_bit(BDI_registered, &bdi->state)) {
-				WARN_ON(1);
-				printk(KERN_ERR "bdi-%s not registered\n",
-								bdi->name);
+			bdi = inode_to_bdi(inode);
+
+			if (bdi_cap_writeback_dirty(bdi)) {
+				WARN(!test_bit(BDI_registered, &bdi->state),
+				     "bdi-%s not registered\n", bdi->name);
+
+				/*
				 * If this is the first dirty inode for this
				 * bdi, we have to wake-up the corresponding
				 * bdi thread to make sure background
				 * write-back happens later.
				 */
+				if (!wb_has_dirty_io(&bdi->wb))
+					wakeup_bdi = true;
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &wb->b_dirty);
+			list_move(&inode->i_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
 	spin_unlock(&inode_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
@@ -1192,7 +1032,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
@@ -1220,18 +1060,6 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
 /**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
@@ -1243,21 +1071,24 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
 */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	__writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
+		.done		= &done,
+	};
 
-/**
- * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
-	__writeback_inodes_sb(sb, 1);
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	work.nr_pages = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
 }
+EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
 * writeback_inodes_sb_if_idle - start writeback if none underway
@@ -1269,7 +1100,9 @@ void writeback_inodes_sb_locked(struct super_block *sb)
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
 		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
 		return 1;
 	} else
 		return 0;
@@ -1285,7 +1118,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 */
 void sync_inodes_sb(struct super_block *sb)
 {
-	bdi_sync_writeback(sb->s_bdi, sb);
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
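The consumer side of the new queue is symmetric. Stripped of tracing and the force_wait override, the flusher loop added by this patch reduces to the sketch below; flusher_drain is an invented name for illustration, while get_next_work_item() and wb_writeback() are the functions from the diff above.

	/* Pop work items under wb_lock, run them, then signal or free them. */
	static void flusher_drain(struct bdi_writeback *wb)
	{
		struct backing_dev_info *bdi = wb->bdi;
		struct wb_writeback_work *work;

		while ((work = get_next_work_item(bdi)) != NULL) {
			wb_writeback(wb, work);

			/*
			 * A synchronous caller sleeps in wait_for_completion()
			 * and owns the on-stack item; an asynchronous item was
			 * kzalloc()'ed in __bdi_start_writeback() and must be
			 * freed here.
			 */
			if (work->done)
				complete(work->done);
			else
				kfree(work);
		}
	}

Note how this ownership rule makes the WB_SYNC_ALL case safe without the old RCU deferral: complete() is the flusher's last touch of the item, after which the waiter's stack frame may go away.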