Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--	fs/fs-writeback.c	197
1 file changed, 144 insertions(+), 53 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1e23c33ea5cf..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,13 +79,14 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 	return sb->s_bdi;
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-		struct wb_writeback_work *work)
+static inline struct inode *wb_inode(struct list_head *head)
 {
-	trace_writeback_queue(bdi, work);
+	return list_entry(head, struct inode, i_wb_list);
+}
 
-	spin_lock_bh(&bdi->wb_lock);
-	list_add_tail(&work->list, &bdi->work_list);
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
+{
 	if (bdi->wb.task) {
 		wake_up_process(bdi->wb.task);
 	} else {
@@ -93,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 		 * The bdi thread isn't there, wake up the forker thread which
 		 * will create and run it.
 		 */
-		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
 	}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+		struct wb_writeback_work *work)
+{
+	trace_writeback_queue(bdi, work);
+
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (!bdi->wb.task)
+		trace_writeback_nothread(bdi, work);
+	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, bool for_background)
+		      bool range_cyclic)
 {
 	struct wb_writeback_work *work;
 
@@ -121,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
-	work->for_background = for_background;
 
 	bdi_queue_work(bdi, work);
 }
@@ -139,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, false);
+	__bdi_start_writeback(bdi, nr_pages, true);
 }
 
 /**
@@ -147,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
  * @bdi: the backing device to write from
  *
  * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
- *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for given BDI
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	__bdi_start_writeback(bdi, LONG_MAX, true, true);
+	/*
+	 * We just wake up the flusher thread. It will perform background
+	 * writeback as soon as there is no other work to do.
+	 */
+	trace_writeback_wake_background(bdi);
+	spin_lock_bh(&bdi->wb_lock);
+	bdi_wakeup_flusher(bdi);
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 /*
@@ -172,11 +191,11 @@ static void redirty_tail(struct inode *inode)
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+		tail = wb_inode(wb->b_dirty.next);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &wb->b_dirty);
+	list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 
 /*
@@ -186,7 +205,7 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	list_move(&inode->i_list, &wb->b_more_io);
+	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -227,14 +246,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 
 	while (!list_empty(delaying_queue)) {
-		inode = list_entry(delaying_queue->prev, struct inode, i_list);
+		inode = wb_inode(delaying_queue->prev);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		if (sb && sb != inode->i_sb)
 			do_sb_sort = 1;
 		sb = inode->i_sb;
-		list_move(&inode->i_list, &tmp);
+		list_move(&inode->i_wb_list, &tmp);
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +264,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 
 	/* Move inodes from one superblock together */
 	while (!list_empty(&tmp)) {
-		inode = list_entry(tmp.prev, struct inode, i_list);
-		sb = inode->i_sb;
+		sb = wb_inode(tmp.prev)->i_sb;
 		list_for_each_prev_safe(pos, node, &tmp) {
-			inode = list_entry(pos, struct inode, i_list);
+			inode = wb_inode(pos);
 			if (inode->i_sb == sb)
-				list_move(&inode->i_list, dispatch_queue);
+				list_move(&inode->i_wb_list, dispatch_queue);
 		}
 	}
 }
@@ -408,16 +426,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * completion.
 			 */
 			redirty_tail(inode);
-		} else if (atomic_read(&inode->i_count)) {
-			/*
-			 * The inode is clean, inuse
-			 */
-			list_move(&inode->i_list, &inode_in_use);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean. At this point we either have
+			 * a reference to the inode or it's on it's way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_wb_list);
 		}
 	}
 	inode_sync_complete(inode);
@@ -465,8 +480,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 
 		if (inode->i_sb != sb) {
 			if (only_this_sb) {
@@ -487,10 +501,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			return 0;
 		}
 
-		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+		/*
+		 * Don't bother with new inodes or inodes beeing freed, first
+		 * kind does not need peridic writeout yet, and for the latter
+		 * kind writeout is handled by the freer.
+		 */
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
+
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -498,7 +518,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 */
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -536,8 +555,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		queue_io(wb, wbc->older_than_this);
 
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
 		if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +600,7 @@ static inline bool over_bground_thresh(void)
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 
 /*
@@ -612,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	long write_chunk;
 	struct inode *inode;
 
 	if (wbc.for_kupdate) {
@@ -624,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          __writeback_inodes_sb()     <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                  (quickly) tag currently dirty pages
+	 *                  (maybe slowly) sync all tagged pages
+	 */
+	if (wbc.sync_mode == WB_SYNC_NONE)
+		write_chunk = MAX_WRITEBACK_PAGES;
+	else
+		write_chunk = LONG_MAX;
+
 	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
@@ -633,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
 			break;
 
 		/*
+		 * Background writeout and kupdate-style writeback may
+		 * run forever. Stop them if there is other work to do
+		 * so that e.g. sync can proceed. They'll be restarted
+		 * after the other works are all done.
+		 */
+		if ((work->for_background || work->for_kupdate) &&
+		    !list_empty(&wb->bdi->work_list))
+			break;
+
+		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
@@ -640,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 			break;
 
 		wbc.more_io = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
 		trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -650,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 			writeback_inodes_wb(wb, &wbc);
 		trace_wbc_writeback_written(&wbc, wb->bdi);
 
-		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
 
 		/*
 		 * If we consumed everything, see if we have more
@@ -666,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		/*
 		 * Did we write something? Try for more
 		 */
-		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+		if (wbc.nr_to_write < write_chunk)
 			continue;
 		/*
 		 * Nothing written. Wait for some inode to
@@ -675,8 +722,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 */
 		spin_lock(&inode_lock);
 		if (!list_empty(&wb->b_more_io)) {
-			inode = list_entry(wb->b_more_io.prev,
-						struct inode, i_list);
+			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
@@ -704,6 +750,34 @@ get_next_work_item(struct backing_dev_info *bdi)
 	return work;
 }
 
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+	return global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) +
+		get_nr_dirty_inodes();
+}
+
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+	if (over_bground_thresh()) {
+
+		struct wb_writeback_work work = {
+			.nr_pages	= LONG_MAX,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_background	= 1,
+			.range_cyclic	= 1,
+		};
+
+		return wb_writeback(wb, &work);
+	}
+
+	return 0;
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
 	unsigned long expired;
@@ -721,9 +795,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 		return 0;
 
 	wb->last_old_flush = jiffies;
-	nr_pages = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	nr_pages = get_nr_dirty_pages();
 
 	if (nr_pages) {
 		struct wb_writeback_work work = {
@@ -775,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	wrote += wb_check_background_flush(wb);
 	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
@@ -790,7 +863,7 @@ int bdi_writeback_thread(void *data)
 	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	current->flags |= PF_SWAPWRITE;
 	set_freezable();
 	wb->last_active = jiffies;
 
@@ -861,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, false);
+		__bdi_start_writeback(bdi, nr_pages, false);
 	}
 	rcu_read_unlock();
 }
@@ -962,7 +1035,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * dirty list.  Add blockdev inodes as well.
 		 */
 		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
+			if (inode_unhashed(inode))
 				goto out;
 		}
 		if (inode->i_state & I_FREEING)
@@ -990,7 +1063,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &bdi->wb.b_dirty);
+			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
@@ -1103,9 +1176,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused));
+	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1154,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
  * @sb: the superblock
  *
  * This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
  */
 void sync_inodes_sb(struct super_block *sb)
 {
@@ -1230,3 +1301,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+	struct writeback_control wbc = {
+		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.nr_to_write = 0, /* metadata-only */
+	};
+
+	return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
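
The wb_inode() helper introduced at the top of this diff is the standard list_entry()/container_of() idiom: given a pointer to a list_head embedded in an inode, it recovers the containing inode. Below is a minimal, self-contained userspace sketch of the same pattern; struct toy_inode and the simplified list_entry() macro are illustrative stand-ins, not kernel code.

#include <stdio.h>
#include <stddef.h>

/* Simplified list_head, as in include/linux/list.h. */
struct list_head {
	struct list_head *next, *prev;
};

/* list_entry() reduces to container_of(): subtract the member offset. */
#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Toy stand-in for struct inode with an embedded i_wb_list. */
struct toy_inode {
	unsigned long i_ino;
	struct list_head i_wb_list;
};

/* Same shape as the wb_inode() helper added by this patch. */
static inline struct toy_inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct toy_inode, i_wb_list);
}

int main(void)
{
	struct toy_inode inode = { .i_ino = 42 };

	/* Given only the embedded list_head, recover the inode. */
	printf("ino = %lu\n", wb_inode(&inode.i_wb_list)->i_ino);
	return 0;
}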
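
With this diff, bdi_start_background_writeback() no longer queues a wb_writeback_work item; it only wakes the flusher thread, and wb_do_writeback() now discovers background work on its own via the new wb_check_background_flush(). A hedged sketch of how a write-throttling path might kick background writeback under this scheme; example_throttle() is a hypothetical caller loosely modeled on balance_dirty_pages(), not code from this patch.

/*
 * Hypothetical caller: if the system is over the background dirty
 * threshold, wake the per-BDI flusher and return immediately; the
 * flusher thread performs the actual writeback asynchronously.
 */
static void example_throttle(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);
	if (global_page_state(NR_FILE_DIRTY) +
	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
		bdi_start_background_writeback(bdi);
}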
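
The newly exported sync_inode_metadata() writes back only the inode itself (nr_to_write == 0 skips data pages). A sketch of how a filesystem's ->fsync method might combine it with a data flush, under the 2.6.37-era fsync(struct file *, int datasync) prototype; example_fsync() is hypothetical, though the pattern resembles generic_file_fsync().

/*
 * Hypothetical ->fsync helper: flush and wait on data pages first,
 * then write the inode itself with WB_SYNC_ALL semantics.
 */
static int example_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;

	/* Write out and wait on dirty data pages. */
	err = filemap_write_and_wait(inode->i_mapping);
	if (err)
		return err;

	/* For datasync, skip the inode write if only timestamps changed. */
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return 0;

	/* Write just the inode, waiting for completion (wait == 1). */
	return sync_inode_metadata(inode, 1);
}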