summaryrefslogtreecommitdiff
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c197
1 files changed, 144 insertions, 53 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1e23c33ea5cf..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,13 +79,14 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
return sb->s_bdi;
}
-static void bdi_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
+static inline struct inode *wb_inode(struct list_head *head)
{
- trace_writeback_queue(bdi, work);
+ return list_entry(head, struct inode, i_wb_list);
+}
- spin_lock_bh(&bdi->wb_lock);
- list_add_tail(&work->list, &bdi->work_list);
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
+{
if (bdi->wb.task) {
wake_up_process(bdi->wb.task);
} else {
@@ -93,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
* The bdi thread isn't there, wake up the forker thread which
* will create and run it.
*/
- trace_writeback_nothread(bdi, work);
wake_up_process(default_backing_dev_info.wb.task);
}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+{
+ trace_writeback_queue(bdi, work);
+
+ spin_lock_bh(&bdi->wb_lock);
+ list_add_tail(&work->list, &bdi->work_list);
+ if (!bdi->wb.task)
+ trace_writeback_nothread(bdi, work);
+ bdi_wakeup_flusher(bdi);
spin_unlock_bh(&bdi->wb_lock);
}
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic, bool for_background)
+ bool range_cyclic)
{
struct wb_writeback_work *work;
@@ -121,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->sync_mode = WB_SYNC_NONE;
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
- work->for_background = for_background;
bdi_queue_work(bdi, work);
}
@@ -139,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- __bdi_start_writeback(bdi, nr_pages, true, false);
+ __bdi_start_writeback(bdi, nr_pages, true);
}
/**
@@ -147,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
* @bdi: the backing device to write from
*
* Description:
- * This does WB_SYNC_NONE background writeback. The IO is only
- * started when this function returns, we make no guarentees on
- * completion. Caller need not hold sb s_umount semaphore.
+ * This makes sure WB_SYNC_NONE background writeback happens. When
+ * this function returns, it is only guaranteed that for given BDI
+ * some IO is happening if we are over background dirty threshold.
+ * Caller need not hold sb s_umount semaphore.
*/
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
- __bdi_start_writeback(bdi, LONG_MAX, true, true);
+ /*
+ * We just wake up the flusher thread. It will perform background
+ * writeback as soon as there is no other work to do.
+ */
+ trace_writeback_wake_background(bdi);
+ spin_lock_bh(&bdi->wb_lock);
+ bdi_wakeup_flusher(bdi);
+ spin_unlock_bh(&bdi->wb_lock);
}
/*
@@ -172,11 +191,11 @@ static void redirty_tail(struct inode *inode)
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
- tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ tail = wb_inode(wb->b_dirty.next);
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &wb->b_dirty);
+ list_move(&inode->i_wb_list, &wb->b_dirty);
}
/*
@@ -186,7 +205,7 @@ static void requeue_io(struct inode *inode)
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- list_move(&inode->i_list, &wb->b_more_io);
+ list_move(&inode->i_wb_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -227,14 +246,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0;
while (!list_empty(delaying_queue)) {
- inode = list_entry(delaying_queue->prev, struct inode, i_list);
+ inode = wb_inode(delaying_queue->prev);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
- list_move(&inode->i_list, &tmp);
+ list_move(&inode->i_wb_list, &tmp);
}
/* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +264,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/* Move inodes from one superblock together */
while (!list_empty(&tmp)) {
- inode = list_entry(tmp.prev, struct inode, i_list);
- sb = inode->i_sb;
+ sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
- inode = list_entry(pos, struct inode, i_list);
+ inode = wb_inode(pos);
if (inode->i_sb == sb)
- list_move(&inode->i_list, dispatch_queue);
+ list_move(&inode->i_wb_list, dispatch_queue);
}
}
}
@@ -408,16 +426,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* completion.
*/
redirty_tail(inode);
- } else if (atomic_read(&inode->i_count)) {
- /*
- * The inode is clean, inuse
- */
- list_move(&inode->i_list, &inode_in_use);
} else {
/*
- * The inode is clean, unused
+ * The inode is clean. At this point we either have
+ * a reference to the inode or it's on its way out.
+ * No need to add it back to the LRU.
*/
- list_move(&inode->i_list, &inode_unused);
+ list_del_init(&inode->i_wb_list);
}
}
inode_sync_complete(inode);
@@ -465,8 +480,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
{
while (!list_empty(&wb->b_io)) {
long pages_skipped;
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ struct inode *inode = wb_inode(wb->b_io.prev);
if (inode->i_sb != sb) {
if (only_this_sb) {
@@ -487,10 +501,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
return 0;
}
- if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+ /*
+ * Don't bother with new inodes or inodes being freed, first
+ * kind does not need periodic writeout yet, and for the latter
+ * kind writeout is handled by the freer.
+ */
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
requeue_io(inode);
continue;
}
+
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -498,7 +518,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
if (inode_dirtied_after(inode, wbc->wb_start))
return 1;
- BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
@@ -536,8 +555,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
queue_io(wb, wbc->older_than_this);
while (!list_empty(&wb->b_io)) {
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +600,7 @@ static inline bool over_bground_thresh(void)
global_dirty_limits(&background_thresh, &dirty_thresh);
return (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+ global_page_state(NR_UNSTABLE_NFS) > background_thresh);
}
/*
@@ -612,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
};
unsigned long oldest_jif;
long wrote = 0;
+ long write_chunk;
struct inode *inode;
if (wbc.for_kupdate) {
@@ -624,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
wbc.range_end = LLONG_MAX;
}
+ /*
+ * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+ * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+ * here avoids calling into writeback_inodes_wb() more than once.
+ *
+ * The intended call sequence for WB_SYNC_ALL writeback is:
+ *
+ * wb_writeback()
+ * __writeback_inodes_sb() <== called only once
+ * write_cache_pages() <== called once for each inode
+ * (quickly) tag currently dirty pages
+ * (maybe slowly) sync all tagged pages
+ */
+ if (wbc.sync_mode == WB_SYNC_NONE)
+ write_chunk = MAX_WRITEBACK_PAGES;
+ else
+ write_chunk = LONG_MAX;
+
wbc.wb_start = jiffies; /* livelock avoidance */
for (;;) {
/*
@@ -633,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
/*
+ * Background writeout and kupdate-style writeback may
+ * run forever. Stop them if there is other work to do
+ * so that e.g. sync can proceed. They'll be restarted
+ * after the other works are all done.
+ */
+ if ((work->for_background || work->for_kupdate) &&
+ !list_empty(&wb->bdi->work_list))
+ break;
+
+ /*
* For background writeout, stop when we are below the
* background dirty threshold
*/
@@ -640,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
wbc.more_io = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -650,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
writeback_inodes_wb(wb, &wbc);
trace_wbc_writeback_written(&wbc, wb->bdi);
- work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ work->nr_pages -= write_chunk - wbc.nr_to_write;
+ wrote += write_chunk - wbc.nr_to_write;
/*
* If we consumed everything, see if we have more
@@ -666,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Did we write something? Try for more
*/
- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ if (wbc.nr_to_write < write_chunk)
continue;
/*
* Nothing written. Wait for some inode to
@@ -675,8 +722,7 @@ static long wb_writeback(struct bdi_writeback *wb,
*/
spin_lock(&inode_lock);
if (!list_empty(&wb->b_more_io)) {
- inode = list_entry(wb->b_more_io.prev,
- struct inode, i_list);
+ inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
inode_wait_for_writeback(inode);
}
@@ -704,6 +750,34 @@ get_next_work_item(struct backing_dev_info *bdi)
return work;
}
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+ return global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ get_nr_dirty_inodes();
+}
+
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+ if (over_bground_thresh()) {
+
+ struct wb_writeback_work work = {
+ .nr_pages = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .for_background = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &work);
+ }
+
+ return 0;
+}
+
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
unsigned long expired;
@@ -721,9 +795,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
return 0;
wb->last_old_flush = jiffies;
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ nr_pages = get_nr_dirty_pages();
if (nr_pages) {
struct wb_writeback_work work = {
@@ -775,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
+ wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
@@ -790,7 +863,7 @@ int bdi_writeback_thread(void *data)
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ current->flags |= PF_SWAPWRITE;
set_freezable();
wb->last_active = jiffies;
@@ -861,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false, false);
+ __bdi_start_writeback(bdi, nr_pages, false);
}
rcu_read_unlock();
}
@@ -962,7 +1035,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* dirty list. Add blockdev inodes as well.
*/
if (!S_ISBLK(inode->i_mode)) {
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
goto out;
}
if (inode->i_state & I_FREEING)
@@ -990,7 +1063,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &bdi->wb.b_dirty);
+ list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
}
}
out:
@@ -1103,9 +1176,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
*/
void writeback_inodes_sb(struct super_block *sb)
{
- return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused));
+ return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
}
EXPORT_SYMBOL(writeback_inodes_sb);
@@ -1154,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
* @sb: the superblock
*
* This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
*/
void sync_inodes_sb(struct super_block *sb)
{
@@ -1230,3 +1301,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+ struct writeback_control wbc = {
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+ .nr_to_write = 0, /* metadata-only */
+ };
+
+ return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);