diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 01:00:45 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 01:00:45 +0300 |
| commit | c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b (patch) | |
| tree | 2cb320b5bc6f1c97da837e8cb43352a72e789267 | |
| parent | 0793d39ec8bab2b2255e3a288894c39e88ce5a75 (diff) | |
| parent | 0275dc184aa007b260374af6d46fb15741c062a8 (diff) | |
| download | linux-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.tar.xz | |
Merge tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs writeback updates from Christian Brauner:
- Fix a race between cgroup_writeback_umount() and inode_switch_wbs()
When a container exits, a race between cgroup_writeback_umount() and
inode_switch_wbs()/cleanup_offline_cgwb() can trigger "VFS: Busy
inodes after unmount" followed by a use-after-free on percpu
counters.
There is a window between inode_prepare_wbs_switch() returning true
(having passed the SB_ACTIVE check and grabbed the inode) and the
subsequent wb_queue_isw() call: if cgroup_writeback_umount() observes
the global isw_nr_in_flight counter as non-zero but flush_workqueue()
finds nothing queued yet, it returns early - leaving a held inode
reference that blocks evict_inodes(), after which a later iput() hits
freed percpu counters.
The race is closed by covering the window from
inode_prepare_wbs_switch() through wb_queue_isw() with an RCU
read-side critical section and synchronizing in the umount path.
On top of that, the now-dead rcu_barrier() left over from the
queue_rcu_work() era is removed, and the global
synchronize_rcu()/flush_workqueue() pair is replaced with a per-sb
in-flight counter plus pin/unpin/drain helpers so umount no longer
serializes against switch activity on unrelated superblocks.
Under cgroup writeback churn on a 16 vCPU guest this takes umount
latency from ~92-138ms p50 down to ~5-8ms p50 and the cumulative cost
of cgroup_writeback_umount() from ~62ms to ~4us per call.
The initial race fix is kept separate and minimal so it backports
cleanly to stable trees that still queue switches via
queue_rcu_work().
- Improve write performance with RWF_DONTCACHE
Dirty DONTCACHE pages are now tracked per bdi_writeback so that the
writeback flusher can be kicked in a targeted fashion for
IOCB_DONTCACHE writes instead of relying on global writeback, and the
PG_dropbehind flag is preserved when a folio is split.
* tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
mm: track DONTCACHE dirty pages per bdi_writeback
mm: preserve PG_dropbehind flag during folio split
writeback: use a per-sb counter to drain inode wb switches at umount
writeback: drop now-unnecessary rcu_barrier() in cgroup_writeback_umount()
writeback: fix race between cgroup_writeback_umount() and inode_switch_wbs()
| -rw-r--r-- | fs/fs-writeback.c | 138 | ||||
| -rw-r--r-- | include/linux/backing-dev-defs.h | 3 | ||||
| -rw-r--r-- | include/linux/fs.h | 6 | ||||
| -rw-r--r-- | include/linux/fs/super_types.h | 8 | ||||
| -rw-r--r-- | include/trace/events/writeback.h | 3 | ||||
| -rw-r--r-- | mm/filemap.c | 15 | ||||
| -rw-r--r-- | mm/huge_memory.c | 1 | ||||
| -rw-r--r-- | mm/page-writeback.c | 6 |
8 files changed, 147 insertions, 33 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a65694cbfe68..fdb8766d275a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -432,6 +432,10 @@ static bool inode_do_switch_wbs(struct inode *inode, long nr = folio_nr_pages(folio); wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); + if (folio_test_dropbehind(folio)) { + wb_stat_mod(old_wb, WB_DONTCACHE_DIRTY, -nr); + wb_stat_mod(new_wb, WB_DONTCACHE_DIRTY, nr); + } } } @@ -497,6 +501,23 @@ skip_switch: return switched; } +static inline void cgroup_writeback_pin(struct super_block *sb) +{ + atomic_inc(&sb->s_isw_nr_in_flight); +} + +static inline void cgroup_writeback_unpin(struct super_block *sb) +{ + if (atomic_dec_and_test(&sb->s_isw_nr_in_flight)) + wake_up_var(&sb->s_isw_nr_in_flight); +} + +static inline void cgroup_writeback_drain(struct super_block *sb) +{ + wait_var_event(&sb->s_isw_nr_in_flight, + !atomic_read(&sb->s_isw_nr_in_flight)); +} + static void process_inode_switch_wbs(struct bdi_writeback *new_wb, struct inode_switch_wbs_context *isw) { @@ -554,8 +575,12 @@ relock: wb_put_many(old_wb, nr_switched); } - for (inodep = isw->inodes; *inodep; inodep++) + for (inodep = isw->inodes; *inodep; inodep++) { + struct super_block *sb = (*inodep)->i_sb; + iput(*inodep); + cgroup_writeback_unpin(sb); + } wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); @@ -598,16 +623,19 @@ void inode_switch_wbs_work_fn(struct work_struct *work) static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { + /* Avoid the atomic_inc/smp_mb dance once SB_ACTIVE is gone. */ + if (!(inode->i_sb->s_flags & SB_ACTIVE)) + return false; + /* - * Paired with smp_mb() in cgroup_writeback_umount(). - * isw_nr_in_flight must be increased before checking SB_ACTIVE and - * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 - * in cgroup_writeback_umount() and the isw_wq will be not flushed. 
+ * Pairs with smp_mb() in cgroup_writeback_umount(): the umounter either + * sees a non-zero counter and waits, or we see SB_ACTIVE clear below. */ + cgroup_writeback_pin(inode->i_sb); smp_mb(); if (IS_DAX(inode)) - return false; + goto out_unpin; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); @@ -615,13 +643,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode, inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); - return false; + goto out_unpin; } inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); return true; + +out_unpin: + cgroup_writeback_unpin(inode->i_sb); + return false; } static void wb_queue_isw(struct bdi_writeback *wb, @@ -1198,36 +1230,27 @@ out_bdi_put: } /** - * cgroup_writeback_umount - flush inode wb switches for umount + * cgroup_writeback_umount - wait for in-flight inode wb switches on @sb * @sb: target super_block * - * This function is called when a super_block is about to be destroyed and - * flushes in-flight inode wb switches. An inode wb switch goes through - * RCU and then workqueue, so the two need to be flushed in order to ensure - * that all previously scheduled switches are finished. As wb switches are - * rare occurrences and synchronize_rcu() can take a while, perform - * flushing iff wb switches are in flight. + * Wait until every inode wb switch that already passed the SB_ACTIVE + * check on this superblock has been completed by the worker. Since + * SB_ACTIVE is cleared before this is called, no new switches can start + * for @sb, so s_isw_nr_in_flight will monotonically drop to zero. */ void cgroup_writeback_umount(struct super_block *sb) { - if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) return; /* - * SB_ACTIVE should be reliably cleared before checking - * isw_nr_in_flight, see generic_shutdown_super(). 
+ * Pairs with smp_mb() in inode_prepare_wbs_switch(): we either observe + * a non-zero counter and wait, or the switcher sees SB_ACTIVE clear + * (cleared by generic_shutdown_super()) and bails before grabbing the + * inode. */ smp_mb(); - - if (atomic_read(&isw_nr_in_flight)) { - /* - * Use rcu_barrier() to wait for all pending callbacks to - * ensure that all in-flight wb switches are in the workqueue. - */ - rcu_barrier(); - flush_workqueue(isw_wq); - } + cgroup_writeback_drain(sb); } static int __init cgroup_writeback_init(void) @@ -2373,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb) return nr_pages; } +static long wb_check_start_dontcache(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_and_clear_bit(WB_start_dontcache, &wb->state)) + return 0; + + nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = WB_REASON_DONTCACHE, + }; + + nr_pages = wb_writeback(wb, &work); + } + + return nr_pages; +} /* * Retrieve work items and do the writeback they describe @@ -2395,6 +2439,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) wrote += wb_check_start_all(wb); /* + * Check for dontcache writeback request + */ + wrote += wb_check_start_dontcache(wb); + + /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); @@ -2468,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, rcu_read_unlock(); } +/** + * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes + * @mapping: address_space that was just written to + * + * Kick the writeback flusher thread to expedite writeback of dontcache dirty + * pages. Queue writeback for the inode's wb for as many pages as there are + * dontcache pages, but don't restrict writeback to dontcache pages only. 
+ * + * This significantly improves performance over either writing all wb's pages + * or writing only dontcache pages. Although it doesn't guarantee quick + * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages + * in check. Over longer term dontcache pages get written and reclaimed by + * background writeback even with this rough heuristic. + */ +void filemap_dontcache_kick_writeback(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + bool need_wakeup = false; + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + if (wb_has_dirty_io(wb) && + !test_bit(WB_start_dontcache, &wb->state) && + !test_and_set_bit(WB_start_dontcache, &wb->state)) { + wb_get(wb); + need_wakeup = true; + } + unlocked_inode_to_wb_end(inode, &cookie); + + if (need_wakeup) { + wb_wakeup(wb); + wb_put(wb); + } +} +EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback); + /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a06b93446d10..4f1084937315 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -26,6 +26,7 @@ enum wb_state { WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ + WB_start_dontcache, /* dontcache writeback pending */ }; enum wb_stat_item { @@ -33,6 +34,7 @@ enum wb_stat_item { WB_WRITEBACK, WB_DIRTIED, WB_WRITTEN, + WB_DONTCACHE_DIRTY, NR_WB_STAT_ITEMS }; @@ -55,6 +57,7 @@ enum wb_reason { */ WB_REASON_FORKER_THREAD, WB_REASON_FOREIGN_FLUSH, + WB_REASON_DONTCACHE, WB_REASON_MAX, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index a220d14b1f91..2a6d2cb674db 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2633,6 +2633,7 @@ extern int __must_check file_write_and_wait_range(struct 
file *file, loff_t start, loff_t end); int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); +void filemap_dontcache_kick_writeback(struct address_space *mapping); static inline int file_write_and_wait(struct file *file) { @@ -2666,10 +2667,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - - filemap_flush_range(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping); } return count; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..1ab4e2265129 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -274,6 +274,14 @@ struct super_block { /* number of fserrors that are being sent to fsnotify/filesystems */ refcount_t s_pending_errors; + +#ifdef CONFIG_CGROUP_WRITEBACK + /* + * Number of in-flight inode wb switches for this sb. Drained by + * cgroup_writeback_umount() before tear-down. 
+ */ + atomic_t s_isw_nr_in_flight; +#endif } __randomize_layout; /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index bdac0d685a98..13ee076ccd16 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -44,7 +44,8 @@ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ - EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") + EM( WB_REASON_FOREIGN_FLUSH, "foreign_flush") \ + EMe(WB_REASON_DONTCACHE, "dontcache") WB_WORK_REASON diff --git a/mm/filemap.c b/mm/filemap.c index 4e636647100c..179f2886f8c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2052,8 +2052,19 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); /* not an uncached lookup, clear uncached if set */ - if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) - folio_clear_dropbehind(folio); + if (!(fgp_flags & FGP_DONTCACHE) && folio_test_clear_dropbehind(folio)) { + if (folio_test_dirty(folio) && + mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + long nr = folio_nr_pages(folio); + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr); + unlocked_inode_to_wb_end(inode, &cookie); + } + } return folio; } EXPORT_SYMBOL(__filemap_get_folio_mpol); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b118bcd392cb..d29e85495091 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3644,6 +3644,7 @@ static void __split_folio_to_order(struct folio *folio, int old_order, (1L << PG_arch_3) | #endif (1L << PG_dirty) | + (1L << PG_dropbehind) | LRU_GEN_MASK | LRU_REFS_MASK)); if (handle_hwpoison && diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 833f743f309f..e98748112d1e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2626,6 +2626,8 @@ static void folio_account_dirtied(struct folio *folio, wb = 
inode_to_wb(inode); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + if (folio_test_dropbehind(folio)) + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); @@ -2647,6 +2649,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + if (folio_test_dropbehind(folio)) + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); task_io_account_cancelled_write(nr * PAGE_SIZE); @@ -2916,6 +2920,8 @@ bool folio_clear_dirty_for_io(struct folio *folio) if (folio_test_clear_dirty(folio)) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + if (folio_test_dropbehind(folio)) + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); ret = true; |
