From a88a341a73be4ef035ca26170c849f002797da27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:28 -0400 Subject: writeback: move bandwidth related fields from backing_dev_info into bdi_writeback Currently, a bdi (backing_dev_info) embeds a single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bandwidth related fields from backing_dev_info into bdi_writeback. * The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp, write_bandwidth, avg_write_bandwidth, dirty_ratelimit, balanced_dirty_ratelimit, completions and dirty_exceeded. * writeback_chunk_size() and over_bground_thresh() now take @wb instead of @bdi. * bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...) bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...) bdi_position_ratio(bdi, ...) -> wb_position_ratio(wb, ...) bdi_update_write_bandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...) [__]bdi_update_bandwidth(bdi, ...) -> [__]wb_update_bandwidth(wb, ...) bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...) bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...) * Init/exits of the relocated fields are moved to bdi_wb_init/exit() respectively. Note that explicit zeroing is dropped in the process as wb's are cleared in entirety anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. v2: Typo in description fixed as suggested by Jan. 
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Jaegeuk Kim Cc: Steven Whitehouse Signed-off-by: Jens Axboe --- include/linux/writeback.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b2dd371ec0ca..a6b9db7fcee8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,16 +155,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); - -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); + +void __wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); -- cgit v1.2.3 From f30a7d0cc8d9096d6728fadd0ab024e648010ec0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:14:00 -0400 Subject: writeback: restructure try_writeback_inodes_sb[_nr]() try_writeback_inodes_sb_nr() wraps writeback_inodes_sb_nr() so that it handles s_umount locking and skips if writeback is already in progress. The in progress test is performed on the root wb (bdi_writeback) which isn't sufficient for cgroup writeback support. The test must be done per-wb. 
To prepare for the change, this patch factors out __writeback_inodes_sb_nr() from writeback_inodes_sb_nr() and adds @skip_if_busy and moves the in progress test right before queueing the wb_writeback_work. try_writeback_inodes_sb_nr() now just grabs s_umount and invokes __writeback_inodes_sb_nr() with asserted @skip_if_busy. This way, later addition of multiple wb handling can skip only the wb's which already have writeback in progress. This swaps the order between in progress test and s_umount test which can flip the return value when writeback is in progress and s_umount is being held by someone else but this shouldn't cause any meaningful difference. It's a fringe condition and the return value is an unsynchronized hint anyway. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 52 ++++++++++++++++++++++++++--------------------- include/linux/writeback.h | 6 +++--- 2 files changed, 32 insertions(+), 26 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 093b9594e846..0039c5839cdd 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1581,19 +1581,8 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } -/** - * writeback_inodes_sb_nr - writeback dirty inodes from given super_block - * @sb: the superblock - * @nr: the number of pages to write - * @reason: reason why some writeback work initiated - * - * Start writeback on some inodes on this super_block. No guarantees are made - * on how many (if any) will be written, and this function does not wait - * for IO completion of submitted IO. 
- */ -void writeback_inodes_sb_nr(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason, bool skip_if_busy) { DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { @@ -1609,9 +1598,30 @@ void writeback_inodes_sb_nr(struct super_block *sb, if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + if (skip_if_busy && writeback_in_progress(&bdi->wb)) + return; + wb_queue_work(&bdi->wb, &work); wb_wait_for_completion(bdi, &done); } + +/** + * writeback_inodes_sb_nr - writeback dirty inodes from given super_block + * @sb: the superblock + * @nr: the number of pages to write + * @reason: reason why some writeback work initiated + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. + */ +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) +{ + __writeback_inodes_sb_nr(sb, nr, reason, false); +} EXPORT_SYMBOL(writeback_inodes_sb_nr); /** @@ -1638,19 +1648,15 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. 
*/ -int try_to_writeback_inodes_sb_nr(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason) { - if (writeback_in_progress(&sb->s_bdi->wb)) - return 1; - if (!down_read_trylock(&sb->s_umount)) - return 0; + return false; - writeback_inodes_sb_nr(sb, nr, reason); + __writeback_inodes_sb_nr(sb, nr, reason, true); up_read(&sb->s_umount); - return 1; + return true; } EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); @@ -1662,7 +1668,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a6b9db7fcee8..23af355d5471 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -93,9 +93,9 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); +bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); -- cgit v1.2.3 From 0d960a383ae7aa791b2833e122ba7519d264cf92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:19 -0400 Subject: writeback: clean up 
wb_dirty_limit() The function name wb_dirty_limit(), its argument @dirty and the local variable @wb_dirty are mortally confusing given that the function calculates per-wb threshold value not dirty pages, especially given that @dirty and @wb_dirty are used elsewhere for dirty pages. Let's rename the function to wb_calc_thresh() and wb_dirty to wb_thresh. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 2 +- mm/backing-dev.c | 6 +++--- mm/page-writeback.c | 30 +++++++++++++++--------------- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 881ea5d97c00..b1b3b8184500 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1081,7 +1081,7 @@ static bool over_bground_thresh(struct bdi_writeback *wb) global_page_state(NR_UNSTABLE_NFS) > background_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > wb_dirty_limit(wb, background_thresh)) + if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh)) return true; return false; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 23af355d5471..0435c85d4cfa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,7 +155,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void __wb_update_bandwidth(struct bdi_writeback *wb, unsigned long thresh, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ad5608d01e8c..9c8b7b5a0eee 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -49,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct 
bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; + unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; @@ -67,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); - bdi_thresh = wb_dirty_limit(wb, dirty_thresh); + wb_thresh = wb_calc_thresh(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -87,7 +87,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "state: %10lx\n", (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), - K(bdi_thresh), + K(wb_thresh), K(dirty_thresh), K(background_thresh), (unsigned long) K(wb_stat(wb, WB_DIRTIED)), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 70cf98dc3423..c7745a7fe11e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -556,7 +556,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh) } /** - * wb_dirty_limit - @wb's share of dirty throttling threshold + * wb_calc_thresh - @wb's share of dirty throttling threshold * @wb: bdi_writeback to query * @dirty: global dirty limit in pages * @@ -577,28 +577,28 @@ static unsigned long hard_dirty_limit(unsigned long thresh) * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. */ -unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty) +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { - u64 wb_dirty; + u64 wb_thresh; long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the dirty ratio. + * Calculate this BDI's share of the thresh ratio. 
*/ wb_writeout_fraction(wb, &numerator, &denominator); - wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - wb_dirty *= numerator; - do_div(wb_dirty, denominator); + wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh *= numerator; + do_div(wb_thresh, denominator); wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); - wb_dirty += (dirty * wb_min_ratio) / 100; - if (wb_dirty > (dirty * wb_max_ratio) / 100) - wb_dirty = dirty * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / 100; + if (wb_thresh > (thresh * wb_max_ratio) / 100) + wb_thresh = thresh * wb_max_ratio / 100; - return wb_dirty; + return wb_thresh; } /* @@ -750,7 +750,7 @@ static unsigned long wb_position_ratio(struct bdi_writeback *wb, * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. - * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. @@ -1115,7 +1115,7 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb, * * We rampup dirty_ratelimit forcibly if wb_dirty is low because * it's possible that wb_thresh is close to zero due to inactivity - * of backing device (see the implementation of wb_dirty_limit()). + * of backing device (see the implementation of wb_calc_thresh()). 
 */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = wb_dirty; @@ -1123,7 +1123,7 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb, setpoint = wb_dirty + 1; else setpoint = (wb_thresh + - wb_dirty_limit(wb, bg_thresh)) / 2; + wb_calc_thresh(wb, bg_thresh)) / 2; } if (dirty < setpoint) { @@ -1352,7 +1352,7 @@ static inline void wb_dirty_limits(struct bdi_writeback *wb, * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - *wb_thresh = wb_dirty_limit(wb, dirty_thresh); + *wb_thresh = wb_calc_thresh(wb, dirty_thresh); if (wb_bg_thresh) *wb_bg_thresh = dirty_thresh ? div_u64((u64)*wb_thresh * -- cgit v1.2.3 From 8a73179956e649df0d4b3250db17734f272d8266 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:20 -0400 Subject: writeback: reorganize [__]wb_update_bandwidth() __wb_update_bandwidth() is called from two places - mm/page-writeback.c::balance_dirty_pages() and fs/fs-writeback.c::wb_writeback(). The latter updates only the write bandwidth while the former also deals with the dirty ratelimit. The two callsites are distinguished by whether the @thresh parameter is zero or not, which is cryptic. In addition, the two files define their own different versions of wb_update_bandwidth() on top of __wb_update_bandwidth(), which is confusing to say the least. This patch cleans up [__]wb_update_bandwidth() in the following ways. * __wb_update_bandwidth() now takes an explicit @update_ratelimit parameter to gate dirty ratelimit handling. * mm/page-writeback.c::wb_update_bandwidth() is flattened into its caller - balance_dirty_pages(). * fs/fs-writeback.c::wb_update_bandwidth() is moved to mm/page-writeback.c and __wb_update_bandwidth() is made static. * While at it, add a lockdep assertion to __wb_update_bandwidth(). Except for the lockdep addition, this is pure reorganization and doesn't introduce any behavioral changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 10 ---------- include/linux/writeback.h | 9 +-------- mm/page-writeback.c | 45 ++++++++++++++++++++++----------------------- 3 files changed, 23 insertions(+), 41 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b1b3b8184500..cd89484486f6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1087,16 +1087,6 @@ static bool over_bground_thresh(struct bdi_writeback *wb) return false; } -/* - * Called under wb->list_lock. If there are multiple wb per bdi, - * only the flusher working on the first wb should do it. - */ -static void wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long start_time) -{ - __wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time); -} - /* * Explicit flushing or periodic writeback of "old" data. * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0435c85d4cfa..80adf3d88d9d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -157,14 +157,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void __wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); - +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c7745a7fe11e..bebdd41b8d8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1160,19 +1160,22 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb, 
trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); } -void __wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long wb_thresh, - unsigned long wb_dirty, - unsigned long start_time) +static void __wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long wb_thresh, + unsigned long wb_dirty, + unsigned long start_time, + bool update_ratelimit) { unsigned long now = jiffies; unsigned long elapsed = now - wb->bw_time_stamp; unsigned long dirtied; unsigned long written; + lockdep_assert_held(&wb->list_lock); + /* * rate-limit, only update once every 200ms. */ @@ -1189,7 +1192,7 @@ void __wb_update_bandwidth(struct bdi_writeback *wb, if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) goto snapshot; - if (thresh) { + if (update_ratelimit) { global_update_bandwidth(thresh, dirty, now); wb_update_dirty_ratelimit(wb, thresh, bg_thresh, dirty, wb_thresh, wb_dirty, @@ -1203,20 +1206,9 @@ snapshot: wb->bw_time_stamp = now; } -static void wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long wb_thresh, - unsigned long wb_dirty, - unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - if (time_is_after_eq_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) - return; - spin_lock(&wb->list_lock); - __wb_update_bandwidth(wb, thresh, bg_thresh, dirty, - wb_thresh, wb_dirty, start_time); - spin_unlock(&wb->list_lock); + __wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time, false); } /* @@ -1467,8 +1459,15 @@ static void balance_dirty_pages(struct address_space *mapping, if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; - wb_update_bandwidth(wb, dirty_thresh, background_thresh, - nr_dirty, wb_thresh, wb_dirty, start_time); + if 
(time_is_before_jiffies(wb->bw_time_stamp + + BANDWIDTH_INTERVAL)) { + spin_lock(&wb->list_lock); + __wb_update_bandwidth(wb, dirty_thresh, + background_thresh, nr_dirty, + wb_thresh, wb_dirty, start_time, + true); + spin_unlock(&wb->list_lock); + } dirty_ratelimit = wb->dirty_ratelimit; pos_ratio = wb_position_ratio(wb, dirty_thresh, -- cgit v1.2.3 From 380c27ca33ebecc9da35aa90c8b3a9154f90aac2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:21 -0400 Subject: writeback: implement wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportionately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. Currently, what constitutes the global writeback domain is scattered across a number of global states. This patch starts collecting them into struct wb_domain. * fprop_global which serves as the basis for proportional bandwidth measurement and its period timer are moved into struct wb_domain. * global_wb_domain hosts the states for the global domain. * While at it, flatten wb_writeout_fraction() into its callers. This thin wrapper doesn't provide any actual benefits while getting in the way. This is pure reorganization and doesn't introduce any behavioral changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 32 +++++++++++++++++++++ mm/page-writeback.c | 72 ++++++++++++++++++----------------------------- 2 files changed, 59 insertions(+), 45 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 80adf3d88d9d..3148db1296a2 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -86,6 +87,36 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ }; +/* + * A wb_domain represents a domain that wb's (bdi_writeback's) belong to + * and are measured against each other in. There always is one global + * domain, global_wb_domain, that every wb in the system is a member of. + * This allows measuring the relative bandwidth of each wb to distribute + * dirtyable memory accordingly. + */ +struct wb_domain { + /* + * Scale the writeback cache size proportional to the relative + * writeout speed. + * + * We do this by keeping a floating proportion between BDIs, based + * on page writeback completions [end_page_writeback()]. Those + * devices that write out pages fastest will get the larger share, + * while the slower will get a smaller share. + * + * We use page writeout completions because we are interested in + * getting rid of dirty pages. Having them written out is the + * primary goal. + * + * We introduce a concept of time, a period over which we measure + * these events, because demand can/will vary over time. The length + * of this period itself is measured in page writeback completions. 
+ */ + struct fprop_global completions; + struct timer_list period_timer; /* timer for aging of completions */ + unsigned long period_time; +}; + /* * fs/fs-writeback.c */ @@ -120,6 +151,7 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); +int wb_domain_init(struct wb_domain *dom, gfp_t gfp); extern unsigned long global_dirty_limit; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bebdd41b8d8e..08e1737edb39 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -124,29 +124,7 @@ EXPORT_SYMBOL(laptop_mode); unsigned long global_dirty_limit; -/* - * Scale the writeback cache size proportional to the relative writeout speeds. - * - * We do this by keeping a floating proportion between BDIs, based on page - * writeback completions [end_page_writeback()]. Those devices that write out - * pages fastest will get the larger share, while the slower will get a smaller - * share. - * - * We use page writeout completions because we are interested in getting rid of - * dirty pages. Having them written out is the primary goal. - * - * We introduce a concept of time, a period over which we measure these events, - * because demand can/will vary over time. The length of this period itself is - * measured in page writeback completions. - * - */ -static struct fprop_global writeout_completions; - -static void writeout_period(unsigned long t); -/* Timer for aging of writeout_completions */ -static struct timer_list writeout_period_timer = - TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); -static unsigned long writeout_period_time = 0; +static struct wb_domain global_wb_domain; /* * Length of period for aging writeout fractions of bdis. 
This is an @@ -433,24 +411,26 @@ static unsigned long wp_next_time(unsigned long cur_time) } /* - * Increment the BDI's writeout completion count and the global writeout + * Increment the wb's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). */ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { + struct wb_domain *dom = &global_wb_domain; + __inc_wb_stat(wb, WB_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &wb->completions, + __fprop_inc_percpu_max(&dom->completions, &wb->completions, wb->bdi->max_prop_frac); /* First event after period switching was turned off? */ - if (!unlikely(writeout_period_time)) { + if (!unlikely(dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ - writeout_period_time = wp_next_time(jiffies); - mod_timer(&writeout_period_timer, writeout_period_time); + dom->period_time = wp_next_time(jiffies); + mod_timer(&dom->period_timer, dom->period_time); } } @@ -464,38 +444,38 @@ void wb_writeout_inc(struct bdi_writeback *wb) } EXPORT_SYMBOL_GPL(wb_writeout_inc); -/* - * Obtain an accurate fraction of the BDI's portion. - */ -static void wb_writeout_fraction(struct bdi_writeback *wb, - long *numerator, long *denominator) -{ - fprop_fraction_percpu(&writeout_completions, &wb->completions, - numerator, denominator); -} - /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. 
*/ static void writeout_period(unsigned long t) { - int miss_periods = (jiffies - writeout_period_time) / + struct wb_domain *dom = (void *)t; + int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; - if (fprop_new_period(&writeout_completions, miss_periods + 1)) { - writeout_period_time = wp_next_time(writeout_period_time + + if (fprop_new_period(&dom->completions, miss_periods + 1)) { + dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); - mod_timer(&writeout_period_timer, writeout_period_time); + mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ - writeout_period_time = 0; + dom->period_time = 0; } } +int wb_domain_init(struct wb_domain *dom, gfp_t gfp) +{ + memset(dom, 0, sizeof(*dom)); + init_timer_deferrable(&dom->period_timer); + dom->period_timer.function = writeout_period; + dom->period_timer.data = (unsigned long)dom; + return fprop_global_init(&dom->completions, gfp); +} + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not @@ -579,6 +559,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh) */ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { + struct wb_domain *dom = &global_wb_domain; u64 wb_thresh; long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; @@ -586,7 +567,8 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) /* * Calculate this BDI's share of the thresh ratio. 
 */ - wb_writeout_fraction(wb, &numerator, &denominator); + fprop_fraction_percpu(&dom->completions, &wb->completions, + &numerator, &denominator); wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; wb_thresh *= numerator; @@ -1831,7 +1813,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions, GFP_KERNEL); + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); } /** -- cgit v1.2.3 From dcc25ae76eb7b8ff883eaaab57e30e8f2f085be3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:22 -0400 Subject: writeback: move global_dirty_limit into wb_domain This patch is a part of the series to define wb_domain which represents a domain that wb's (bdi_writeback's) belong to and are measured against each other in. This will enable IO backpressure propagation for cgroup writeback. global_dirty_limit exists to regulate the global dirty threshold which is a property of the wb_domain. This patch moves global_dirty_limit, dirty_lock, and update_time into wb_domain. This is pure reorganization and doesn't introduce any behavioral changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 17 ++++++++++++++- include/trace/events/writeback.h | 7 +++--- mm/page-writeback.c | 46 ++++++++++++++++++++-------------------- 4 files changed, 44 insertions(+), 28 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cd89484486f6..51c8a5b14cdf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -887,7 +887,7 @@ static long writeback_chunk_size(struct bdi_writeback *wb, pages = LONG_MAX; else { pages = min(wb->avg_write_bandwidth / 2, - global_dirty_limit / DIRTY_SCOPE); + global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3148db1296a2..5fdd4e1805e6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -95,6 +95,8 @@ struct writeback_control { * dirtyable memory accordingly. */ struct wb_domain { + spinlock_t lock; + /* * Scale the writeback cache size proportional to the relative * writeout speed. @@ -115,6 +117,19 @@ struct wb_domain { struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; + + /* + * The dirtyable memory and dirty threshold could be suddenly + * knocked down by a large amount (eg. on the startup of KVM in a + * swapless system). This may throw the system into deep dirty + * exceeded state and throttle heavy/light dirtiers alike. To + * retain good responsiveness, maintain global_dirty_limit for + * tracking slowly down to the knocked down dirty threshold. + * + * Both fields are protected by ->lock. 
+ */ + unsigned long dirty_limit_tstamp; + unsigned long dirty_limit; }; /* @@ -153,7 +168,7 @@ void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); -extern unsigned long global_dirty_limit; +extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern int dirty_background_ratio; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 9b876f6cc81a..bec69995968f 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -361,7 +361,7 @@ TRACE_EVENT(global_dirty_state, __entry->nr_written = global_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; - __entry->dirty_limit = global_dirty_limit; + __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu unstable=%lu " @@ -463,8 +463,9 @@ TRACE_EVENT(balance_dirty_pages, unsigned long freerun = (thresh + bg_thresh) / 2; strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->limit = global_dirty_limit; - __entry->setpoint = (global_dirty_limit + freerun) / 2; + __entry->limit = global_wb_domain.dirty_limit; + __entry->setpoint = (global_wb_domain.dirty_limit + + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 08e1737edb39..27e60ba8e688 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -122,9 +122,7 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -unsigned long global_dirty_limit; - -static struct wb_domain global_wb_domain; +struct wb_domain global_wb_domain; /* * Length of period for aging writeout fractions of bdis. 
This is an @@ -470,9 +468,15 @@ static void writeout_period(unsigned long t) int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); + + spin_lock_init(&dom->lock); + init_timer_deferrable(&dom->period_timer); dom->period_timer.function = writeout_period; dom->period_timer.data = (unsigned long)dom; + + dom->dirty_limit_tstamp = jiffies; + return fprop_global_init(&dom->completions, gfp); } @@ -532,7 +536,9 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh, static unsigned long hard_dirty_limit(unsigned long thresh) { - return max(thresh, global_dirty_limit); + struct wb_domain *dom = &global_wb_domain; + + return max(thresh, dom->dirty_limit); } /** @@ -916,17 +922,10 @@ out: wb->avg_write_bandwidth = avg; } -/* - * The global dirtyable memory and dirty threshold could be suddenly knocked - * down by a large amount (eg. on the startup of KVM in a swapless system). - * This may throw the system into deep dirty exceeded state and throttle - * heavy/light dirtiers alike. To retain good responsiveness, maintain - * global_dirty_limit for tracking slowly down to the knocked down dirty - * threshold. - */ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) { - unsigned long limit = global_dirty_limit; + struct wb_domain *dom = &global_wb_domain; + unsigned long limit = dom->dirty_limit; /* * Follow up in one step. @@ -939,7 +938,7 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce - * global_dirty_limit which is guaranteed to lie above the dirty pages. + * dom->dirty_limit which is guaranteed to lie above the dirty pages. 
*/ thresh = max(thresh, dirty); if (limit > thresh) { @@ -948,28 +947,27 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) } return; update: - global_dirty_limit = limit; + dom->dirty_limit = limit; } static void global_update_bandwidth(unsigned long thresh, unsigned long dirty, unsigned long now) { - static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time = INITIAL_JIFFIES; + struct wb_domain *dom = &global_wb_domain; /* * check locklessly first to optimize away locking for the most time */ - if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; - spin_lock(&dirty_lock); - if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { + spin_lock(&dom->lock); + if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(thresh, dirty); - update_time = now; + dom->dirty_limit_tstamp = now; } - spin_unlock(&dirty_lock); + spin_unlock(&dom->lock); } /* @@ -1761,10 +1759,12 @@ void laptop_sync_completion(void) void writeback_set_ratelimit(void) { + struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; + global_dirty_limits(&background_thresh, &dirty_thresh); - global_dirty_limit = dirty_thresh; + dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; -- cgit v1.2.3 From aa661bbe1e61ce80ca4ae98804f673ede94b0827 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:31 -0400 Subject: writeback: move over_bground_thresh() to mm/page-writeback.c and rename it to wb_over_bg_thresh(). The function is closely tied to the dirty throttling mechanism implemented in page-writeback.c. This relocation will allow future updates necessary for cgroup writeback support. While at it, add function comment. This is pure reorganization and doesn't introduce any behavioral changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 20 ++------------------ include/linux/writeback.h | 1 + mm/page-writeback.c | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 18 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 51c8a5b14cdf..da355879ba7c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1071,22 +1071,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, return nr_pages - work.nr_pages; } -static bool over_bground_thresh(struct bdi_writeback *wb) -{ - unsigned long background_thresh, dirty_thresh; - - global_dirty_limits(&background_thresh, &dirty_thresh); - - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh) - return true; - - if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh)) - return true; - - return false; -} - /* * Explicit flushing or periodic writeback of "old" data. 
* @@ -1136,7 +1120,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb)) + if (work->for_background && !wb_over_bg_thresh(wb)) break; /* @@ -1227,7 +1211,7 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb)) { + if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5fdd4e1805e6..b57c2786b5aa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,6 +207,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); +bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c8ac8cea67dc..9d9a896fa7b5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1740,6 +1740,29 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +/** + * wb_over_bg_thresh - does @wb need to be written back? + * @wb: bdi_writeback of interest + * + * Determines whether background writeback should keep writing @wb or it's + * clean enough. Returns %true if writeback should continue. 
+ */ +bool wb_over_bg_thresh(struct bdi_writeback *wb) +{ + unsigned long background_thresh, dirty_thresh; + + global_dirty_limits(&background_thresh, &dirty_thresh); + + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh)) + return true; + + return false; +} + void throttle_vm_writeout(gfp_t gfp_mask) { unsigned long background_thresh; -- cgit v1.2.3 From 841710aa6e4acd066ab9fe8c8cb6f4e4e6709d83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:33 -0400 Subject: writeback: implement memcg wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportionately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. The previous patches laid the groundwork to support the two wb_domains and this patch implements memcg wb_domain. memcg->cgwb_domain is initialized on css online and destroyed on css release, wb->memcg_completions is added, and __wb_writeout_inc() is updated to increment completions against both global and memcg wb_domains. The following patches will update balance_dirty_pages() and its subroutines to actually consider memcg wb_domain for throttling. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/memcontrol.h | 12 +++++++++++- include/linux/writeback.h | 3 +++ mm/backing-dev.c | 9 ++++++++- mm/memcontrol.c | 39 +++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 25 +++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 2 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 97a92fa0cdb5..8d470b73824f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -118,6 +118,7 @@ struct bdi_writeback { #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ + struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 662a953ea8ad..e3177bed23ea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -389,8 +389,18 @@ enum { }; #ifdef CONFIG_CGROUP_WRITEBACK + struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); -#endif +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b57c2786b5aa..04a3786c456f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -167,6 +167,9 @@ static inline void laptop_sync_completion(void) { } void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int 
wb_domain_init(struct wb_domain *dom, gfp_t gfp); +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom); +#endif extern struct wb_domain global_wb_domain; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9c8b7b5a0eee..84ebf7c8d006 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -482,6 +482,7 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); wb_congested_put(wb->congested); + fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); @@ -548,9 +549,13 @@ static int cgwb_create(struct backing_dev_info *bdi, if (ret) goto err_wb_exit; + ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); + if (ret) + goto err_ref_exit; + wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); if (!wb->congested) - goto err_ref_exit; + goto err_fprop_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; @@ -587,6 +592,8 @@ static int cgwb_create(struct backing_dev_info *bdi, err_put_congested: wb_congested_put(wb->congested); +err_fprop_exit: + fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 701cbee9acba..ce113ddf2fb5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -345,6 +345,7 @@ struct mem_cgroup { #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; + struct wb_domain cgwb_domain; #endif /* List of events which userspace want to receive */ @@ -3994,6 +3995,37 @@ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) return &memcg->cgwb_list; } +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = 
mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* @@ -4380,9 +4412,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto out_free; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + spin_lock_init(&memcg->pcp_counter_lock); return memcg; +out_free_stat: + free_percpu(memcg->stat); out_free: kfree(memcg); return NULL; @@ -4409,6 +4447,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); + memcg_wb_domain_exit(memcg); kfree(memcg); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a7ba5cee950b..a146e3389e78 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -171,6 +171,11 @@ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *m return mdtc->gdtc; } +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { @@ -213,6 +218,11 @@ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *m return NULL; } +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { @@ -530,9 +540,16 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, */ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { + struct wb_domain *cgdom; + __inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, 
wb->bdi->max_prop_frac); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac); } void wb_writeout_inc(struct bdi_writeback *wb) @@ -583,6 +600,14 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) return fprop_global_init(&dom->completions, gfp); } +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + del_timer_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not -- cgit v1.2.3 From 2529bb3aadc40a93e642f5f3650f63379a964467 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:34 -0400 Subject: writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes The amount of available memory to a memcg wb_domain can change as memcg configuration changes. A domain's ->dirty_limit exists to smooth out sudden drops in dirty threshold; however, when a domain's size actually drops significantly, it hinders the dirty throttling from adjusting to the new configuration leading to unexpected behaviors including unnecessary OOM kills. This patch resolves the issue by adding wb_domain_size_changed() which resets ->dirty_limit[_tstmp] and making memcg call it on configuration changes. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/writeback.h | 20 ++++++++++++++++++++ mm/memcontrol.c | 12 ++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 04a3786c456f..3b73e97ecfc7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -132,6 +132,26 @@ struct wb_domain { unsigned long dirty_limit; }; +/** + * wb_domain_size_changed - memory available to a wb_domain has changed + * @dom: wb_domain of interest + * + * This function should be called when the amount of memory available to + * @dom has changed. It resets @dom's dirty limit parameters to prevent + * the past values which don't match the current configuration from skewing + * dirty throttling. Without this, when memory size of a wb_domain is + * greatly reduced, the dirty throttling logic may allow too many pages to + * be dirtied leading to consecutive unnecessary OOMs and may get stuck in + * that situation. 
+ */ +static inline void wb_domain_size_changed(struct wb_domain *dom) +{ + spin_lock(&dom->lock); + dom->dirty_limit_tstamp = jiffies; + dom->dirty_limit = 0; + spin_unlock(&dom->lock); +} + /* * fs/fs-writeback.c */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce113ddf2fb5..c0b0406ae5ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4005,6 +4005,11 @@ static void memcg_wb_domain_exit(struct mem_cgroup *memcg) wb_domain_exit(&memcg->cgwb_domain); } +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); @@ -4026,6 +4031,10 @@ static void memcg_wb_domain_exit(struct mem_cgroup *memcg) { } +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* @@ -4624,6 +4633,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; + memcg_wb_domain_size_changed(memcg); } #ifdef CONFIG_MMU @@ -5361,6 +5371,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5393,6 +5404,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; + memcg_wb_domain_size_changed(memcg); return nbytes; } -- cgit v1.2.3 From 21c6321fbb3a3787af07f1bc031d713a707fb69c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:49 -0400 Subject: writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Currently, majority of cgroup writeback support including all the above functions are implemented in include/linux/backing-dev.h and mm/backing-dev.c; however, the portion closely related to writeback logic implemented in include/linux/writeback.h and mm/page-writeback.c will expand to support 
foreign writeback detection and correction. This patch moves wb[_try]_get() and wb_put() to include/linux/backing-dev-defs.h so that they can be used from writeback.h and inode_{attach|detach}_wb() to writeback.h and page-writeback.c. This is pure reorganization and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 31 +++++++++++++++ include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++ include/linux/backing-dev.h | 82 ---------------------------------------- include/linux/writeback.h | 46 ++++++++++++++++++++++ mm/backing-dev.c | 30 --------------- 5 files changed, 127 insertions(+), 112 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index da355879ba7c..cf6ccfb01e03 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "internal.h" /* @@ -213,6 +214,36 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #ifdef CONFIG_CGROUP_WRITEBACK +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. 
+ */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8d470b73824f..e047b496a0b9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e9d7373f5f93..5c978a924157 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); -void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void 
wb_blkcg_offline(struct blkcg *blkcg); int inode_congested(struct inode *inode, int cong_bits); @@ -264,37 +263,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } -/** - * wb_tryget - try to increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - return percpu_ref_tryget(&wb->refcnt); - return true; -} - -/** - * wb_get - increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline void wb_get(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_get(&wb->refcnt); -} - -/** - * wb_put - decrement a wb's refcount - * @wb: bdi_writeback to put - */ -static inline void wb_put(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); -} - /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest @@ -353,35 +321,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_attach_wb - associate an inode with its wb - * @inode: inode of interest - * @page: page being dirtied (may be NULL) - * - * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o - * @inode->i_lock. - */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ - if (!inode->i_wb) - __inode_attach_wb(inode, page); -} - -/** - * inode_detach_wb - disassociate an inode from its wb - * @inode: inode of interest - * - * @inode is being freed. Detach from its wb. 
- */ -static inline void inode_detach_wb(struct inode *inode) -{ - if (inode->i_wb) { - wb_put(inode->i_wb); - inode->i_wb = NULL; - } -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested) { } -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - return true; -} - -static inline void wb_get(struct bdi_writeback *wb) -{ -} - -static inline void wb_put(struct bdi_writeback *wb) -{ -} - static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; @@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ -} - -static inline void inode_detach_wb(struct inode *inode) -{ -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3b73e97ecfc7..6726b7e56beb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -8,6 +8,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } +#ifdef CONFIG_CGROUP_WRITEBACK + +void __inode_attach_wb(struct inode *inode, struct page *page); + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. 
+ */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * mm/page-writeback.c */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 84ebf7c8d006..887d72a85b5e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -660,36 +660,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, return wb; } -void __inode_attach_wb(struct inode *inode, struct page *page) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - struct bdi_writeback *wb = NULL; - - if (inode_cgwb_enabled(inode)) { - struct cgroup_subsys_state *memcg_css; - - if (page) { - memcg_css = mem_cgroup_css_from_page(page); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - } else { - /* must pin memcg_css, see wb_get_create() */ - memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - css_put(memcg_css); - } - } - - if (!wb) - wb = &bdi->wb; - - /* - * There may be multiple instances of this function racing to - * update the same inode. Use cmpxchg() to tell the winner. 
- */ - if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) - wb_put(wb); -} - static void cgwb_bdi_init(struct backing_dev_info *bdi) { bdi->wb.memcg_css = mem_cgroup_root_css; -- cgit v1.2.3 From b16b1deb553adcd7b3b7ce3e6d6fd1b923f314da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Jun 2015 08:39:48 -0600 Subject: writeback: make writeback_control track the inode being written back Currently, for cgroup writeback, the IO submission paths directly associate the bio's with the blkcg from inode_to_wb_blkcg_css(); however, it'd be necessary to keep more writeback context to implement foreign inode writeback detection. wbc (writeback_control) is the natural fit for the extra context - it persists throughout the writeback of each inode and is passed all the way down to IO submission paths. This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and wbc_attach_fdatawrite_inode() which are used to associate wbc with the inode being written back. IO submission paths now use wbc_init_bio() instead of directly associating bio's with blkcg themselves. This leaves inode_to_wb_blkcg_css() w/o any user. The function is removed. wbc currently only tracks the associated wb (bdi_writeback). Future patches will add more for foreign inode detection. The association is established under i_lock which will be depended upon when migrating foreign inodes to other wb's. As currently, once established, inode to wb association never changes, going through wbc when initializing bio's doesn't cause any behavior changes. v2: submit_blk_blkcg() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path. 
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/buffer.c | 25 ++++++++--------- fs/fs-writeback.c | 37 ++++++++++++++++++++++-- fs/mpage.c | 2 +- include/linux/backing-dev.h | 12 -------- include/linux/writeback.h | 68 +++++++++++++++++++++++++++++++++++++++++++++ mm/filemap.c | 2 ++ 6 files changed, 118 insertions(+), 28 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/buffer.c b/fs/buffer.c index b85e94134ea6..d883c799fb45 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,9 +45,9 @@ #include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_blkcg(int rw, struct buffer_head *bh, - unsigned long bio_flags, - struct cgroup_subsys_state *blkcg_css); +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1709,7 +1709,6 @@ static int __block_write_full_page(struct inode *inode, struct page *page, unsigned int blocksize, bbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); - struct cgroup_subsys_state *blkcg_css = inode_to_wb_blkcg_css(inode); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1798,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_blkcg(write_op, bh, 0, blkcg_css); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -1852,7 +1851,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_blkcg(write_op, bh, 0, blkcg_css); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -3017,11 +3016,11 @@ void guard_bio_eod(int rw, struct bio *bio) } } -static int submit_bh_blkcg(int rw, struct buffer_head *bh, - unsigned long bio_flags, - struct cgroup_subsys_state *blkcg_css) +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; + int ret = 0; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); @@ -3041,8 +3040,8 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (blkcg_css) - bio_associate_blkcg(bio, blkcg_css); + if (wbc) + wbc_init_bio(wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; @@ -3071,13 +3070,13 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh, int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) { - return submit_bh_blkcg(rw, bh, bio_flags, NULL); + return submit_bh_wbc(rw, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); int submit_bh(int rw, struct buffer_head *bh) { - return submit_bh_blkcg(rw, bh, 0, NULL); + return submit_bh_wbc(rw, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cf6ccfb01e03..755e8ef8d1f0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ 
-244,6 +244,37 @@ void __inode_attach_wb(struct inode *inode, struct page *page) wb_put(wb); } +/** + * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it + * @wbc: writeback_control of interest + * @inode: target inode + * + * @inode is locked and about to be written back under the control of @wbc. + * Record @inode's writeback context into @wbc and unlock the i_lock. On + * writeback completion, wbc_detach_inode() should be called. This is used + * to track the cgroup writeback context. + */ +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) +{ + wbc->wb = inode_to_wb(inode); + wb_get(wbc->wb); + spin_unlock(&inode->i_lock); +} + +/** + * wbc_detach_inode - disassociate wbc from its target inode + * @wbc: writeback_control of interest + * + * To be called after a writeback attempt of an inode finishes and undoes + * wbc_attach_and_unlock_inode(). Can be called under any context. + */ +void wbc_detach_inode(struct writeback_control *wbc) +{ + wb_put(wbc->wb); + wbc->wb = NULL; +} + /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion @@ -877,10 +908,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); + wbc_detach_inode(wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); /* @@ -1013,7 +1045,7 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; @@ -1025,6 +1057,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - 
wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); diff --git a/fs/mpage.c b/fs/mpage.c index a3ccb0bd465a..388fde6ac255 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: if (bio == NULL) goto confused; - bio_associate_blkcg(bio, inode_to_wb_blkcg_css(inode)); + wbc_init_bio(wbc, bio); } /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5c978a924157..b1d2489a6536 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return inode_to_wb(inode)->blkcg_css; -} - struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline struct cgroup_subsys_state * -inode_to_wb_blkcg_css(struct inode *inode) -{ - return blkcg_root_css; -} - struct wb_iter { int next_id; }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6726b7e56beb..8f964e558af5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -86,6 +86,9 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *wb; /* wb this writeback is issued under */ +#endif }; /* @@ -176,7 +179,14 @@ static inline void wait_on_inode(struct inode *inode) #ifdef CONFIG_CGROUP_WRITEBACK +#include <linux/cgroup.h> +#include <linux/bio.h> + void __inode_attach_wb(struct inode *inode, struct page *page); +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock); +void wbc_detach_inode(struct writeback_control *wbc); /** * inode_attach_wb - associate an inode with its
wb @@ -207,6 +217,44 @@ static inline void inode_detach_wb(struct inode *inode) } } +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} + +/** + * wbc_init_bio - writeback specific initialization of bio + * @wbc: writeback_control for the writeback in progress + * @bio: bio to be initialized + * + * @bio is a part of the writeback in progress controlled by @wbc. Perform + * writeback specific initialization. This is used to apply the cgroup + * writeback context. + */ +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself.
+ */ + if (wbc->wb) + bio_associate_blkcg(bio, wbc->wb->blkcg_css); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct page *page) @@ -217,6 +265,26 @@ static inline void inode_detach_wb(struct inode *inode) { } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ +} + +static inline void wbc_detach_inode(struct writeback_control *wbc) +{ +} + +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/mm/filemap.c b/mm/filemap.c index 7b1443dc3ad0..2f065b19b635 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -290,7 +290,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); return ret; } -- cgit v1.2.3 From 2a81490811d0296d390c571bb64eaa93e5ed7def Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:51 -0400 Subject: writeback: implement foreign cgroup inode detection As concurrent write sharing of an inode is expected to be very rare and memcg only tracks page ownership on first-use basis severely confining the usefulness of such sharing, cgroup writeback tracks ownership per-inode. While the support for concurrent write sharing of an inode is deemed unnecessary, an inode being written to by different cgroups at different points in time is a lot more common, and, more importantly, charging only by first-use can too readily lead to grossly incorrect behaviors (single foreign page can lead to gigabytes of writeback to be incorrectly attributed). 
To resolve this issue, cgroup writeback detects the majority dirtier of an inode and will transfer the ownership to it. To avoid unnecessary oscillation, the detection mechanism keeps track of history and gives out the switch verdict only if the foreign usage pattern is stable over a certain amount of time and/or writeback attempts. The detection mechanism has fairly low space and computation overhead. It adds 8 bytes to struct inode (one int and two u16's) and a minimal amount of calculation per IO. The detection mechanism converges to the correct answer usually in several seconds of IO time when there's a clear majority dirtier. Even when there isn't, it can reach an acceptable answer fairly quickly under most circumstances. Please see wbc_detach_inode() for more details. This patch only implements detection. Following patches will implement actual switching. v2: wbc_account_io() now checks whether the wbc is associated with a wb before dereferencing it. This can happen when pageout() is writing pages directly without going through the usual writeback path. As the pageout() path is single-threaded, we don't want it to be blocked behind a slow cgroup and ultimately want it to delegate actual writing to the usual writeback path.
Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/buffer.c | 4 +- fs/fs-writeback.c | 177 +++++++++++++++++++++++++++++++++++++++++++++- fs/mpage.c | 1 + include/linux/fs.h | 5 ++ include/linux/writeback.h | 16 +++++ 5 files changed, 200 insertions(+), 3 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/buffer.c b/fs/buffer.c index d883c799fb45..aca687f966d7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3040,8 +3040,10 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) + if (wbc) { wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 755e8ef8d1f0..f98d40333c85 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -214,6 +214,20 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #ifdef CONFIG_CGROUP_WRITEBACK +/* parameters for foreign inode detection, see wb_detach_inode() */ +#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ +#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ +#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ + +#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ +#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) + /* each slot's duration is 2s / 16 */ +#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) + /* if foreign slots >= 8, switch */ +#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) + /* one round can affect upto 5 slots */ + void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -258,23 +272,182 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) { wbc->wb = 
inode_to_wb(inode); + wbc->inode = inode; + + wbc->wb_id = wbc->wb->memcg_css->id; + wbc->wb_lcand_id = inode->i_wb_frn_winner; + wbc->wb_tcand_id = 0; + wbc->wb_bytes = 0; + wbc->wb_lcand_bytes = 0; + wbc->wb_tcand_bytes = 0; + wb_get(wbc->wb); spin_unlock(&inode->i_lock); } /** - * wbc_detach_inode - disassociate wbc from its target inode - * @wbc: writeback_control of interest + * wbc_detach_inode - disassociate wbc from inode and perform foreign detection + * @wbc: writeback_control of the just finished writeback * * To be called after a writeback attempt of an inode finishes and undoes * wbc_attach_and_unlock_inode(). Can be called under any context. + * + * As concurrent write sharing of an inode is expected to be very rare and + * memcg only tracks page ownership on first-use basis severely confining + * the usefulness of such sharing, cgroup writeback tracks ownership + * per-inode. While the support for concurrent write sharing of an inode + * is deemed unnecessary, an inode being written to by different cgroups at + * different points in time is a lot more common, and, more importantly, + * charging only by first-use can too readily lead to grossly incorrect + * behaviors (single foreign page can lead to gigabytes of writeback to be + * incorrectly attributed). + * + * To resolve this issue, cgroup writeback detects the majority dirtier of + * an inode and transfers the ownership to it. To avoid unnecessary + * oscillation, the detection mechanism keeps track of history and gives + * out the switch verdict only if the foreign usage pattern is stable over + * a certain amount of time and/or writeback attempts. + * + * On each writeback attempt, @wbc tries to detect the majority writer + * using Boyer-Moore majority vote algorithm.
In addition to the byte + * count from the majority voting, it also counts the bytes written for the + * current wb and the last round's winner wb (max of last round's current + * wb, the winner from two rounds ago, and the last round's majority + * candidate). Keeping track of the historical winner helps the algorithm + * to semi-reliably detect the most active writer even when it's not the + * absolute majority. + * + * Once the winner of the round is determined, whether the winner is + * foreign or not and how much IO time the round consumed is recorded in + * inode->i_wb_frn_history. If the amount of recorded foreign IO time is + * over a certain threshold, the switch verdict is given. */ void wbc_detach_inode(struct writeback_control *wbc) { + struct bdi_writeback *wb = wbc->wb; + struct inode *inode = wbc->inode; + u16 history = inode->i_wb_frn_history; + unsigned long avg_time = inode->i_wb_frn_avg_time; + unsigned long max_bytes, max_time; + int max_id; + + /* pick the winner of this round */ + if (wbc->wb_bytes >= wbc->wb_lcand_bytes && + wbc->wb_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_id; + max_bytes = wbc->wb_bytes; + } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_lcand_id; + max_bytes = wbc->wb_lcand_bytes; + } else { + max_id = wbc->wb_tcand_id; + max_bytes = wbc->wb_tcand_bytes; + } + + /* + * Calculate the amount of IO time the winner consumed and fold it + * into the running average kept per inode. If the consumed IO + * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for + * deciding whether to switch or not. This is to prevent one-off + * small dirtiers from skewing the verdict.
+ */ + max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, + wb->avg_write_bandwidth); + if (avg_time) + avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - + (avg_time >> WB_FRN_TIME_AVG_SHIFT); + else + avg_time = max_time; /* immediate catch up on first run */ + + if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { + int slots; + + /* + * The switch verdict is reached if foreign wb's consume + * more than a certain proportion of IO time in a + * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot + * history mask where each bit represents one sixteenth of + * the period. Determine the number of slots to shift into + * history from @max_time. + */ + slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), + (unsigned long)WB_FRN_HIST_MAX_SLOTS); + history <<= slots; + if (wbc->wb_id != max_id) + history |= (1U << slots) - 1; + + /* + * Switch if the current wb isn't the consistent winner. + * If there are multiple closely competing dirtiers, the + * inode may switch across them repeatedly over time, which + * is okay. The main goal is avoiding keeping an inode on + * the wrong wb for an extended period of time. + */ + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) { + /* switch */ + max_id = 0; + avg_time = 0; + history = 0; + } + } + + /* + * Multiple instances of this function may race to update the + * following fields but we don't mind occasional inaccuracies. + */ + inode->i_wb_frn_winner = max_id; + inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); + inode->i_wb_frn_history = history; + wb_put(wbc->wb); wbc->wb = NULL; } +/** + * wbc_account_io - account IO issued during writeback + * @wbc: writeback_control of the writeback in progress + * @page: page being written out + * @bytes: number of bytes being written out + * + * @bytes from @page are about to be written out during the writeback + * controlled by @wbc. Keep the book for foreign inode detection. See + * wbc_detach_inode().
+ */ +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes) +{ + int id; + + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (!wbc->wb) + return; + + rcu_read_lock(); + id = mem_cgroup_css_from_page(page)->id; + rcu_read_unlock(); + + if (id == wbc->wb_id) { + wbc->wb_bytes += bytes; + return; + } + + if (id == wbc->wb_lcand_id) + wbc->wb_lcand_bytes += bytes; + + /* Boyer-Moore majority vote algorithm */ + if (!wbc->wb_tcand_bytes) + wbc->wb_tcand_id = id; + if (id == wbc->wb_tcand_id) + wbc->wb_tcand_bytes += bytes; + else + wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); +} + /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion diff --git a/fs/mpage.c b/fs/mpage.c index 388fde6ac255..ca0244b69de8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -614,6 +614,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ + wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(WRITE, bio); diff --git a/include/linux/fs.h b/include/linux/fs.h index 67a42ec95065..740126d7c44e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -638,6 +638,11 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ + + /* foreign inode detection, see wbc_detach_inode() */ + int i_wb_frn_winner; + u16 i_wb_frn_avg_time; + u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f964e558af5..b333c945e571 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -88,6 +88,15 @@ struct writeback_control { unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ + struct inode *inode; /* inode being written out */ + + /* foreign inode detection, see wbc_detach_inode() */ + int wb_id; /* current wb id */ + int wb_lcand_id; /* last foreign candidate wb id */ + int wb_tcand_id; /* this foreign candidate wb id */ + size_t wb_bytes; /* bytes written by current wb */ + size_t wb_lcand_bytes; /* bytes written by last candidate */ + size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; @@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes); /** * inode_attach_wb - associate an inode with its wb @@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } +static inline void 
wbc_account_io(struct writeback_control *wbc, + struct page *page, size_t bytes) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v1.2.3