From 7e6cea5ae2f5e62112fce69acc07ee8b694b6dd0 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:52 -0800 Subject: docs: document iomap writeback's iomap_finish_folio_write() requirement Document that iomap_finish_folio_write() must be called after writeback on the range completes. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b1ac08c7474..a5032e456079 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -435,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, -- cgit v1.2.3 From 6b1fd2281fb0873ec56f8791d4e4898302070804 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:53 -0800 Subject: iomap: optimize pending async writeback accounting Pending writebacks must be accounted for to determine when all requests have completed and writeback on the folio should be ended. Currently this is done by atomically incrementing ifs->write_bytes_pending for every range to be written back. Instead, the number of atomic operations can be minimized by setting ifs->write_bytes_pending to the folio size, internally tracking how many bytes are written back asynchronously, and then after sending off all the requests, decrementing ifs->write_bytes_pending by the number of bytes not written back asynchronously. 
Now, for N ranges written back, only N + 2 atomic operations are required instead of 2N + 2. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/fuse/file.c | 4 ++-- fs/iomap/buffered-io.c | 58 +++++++++++++++++++++++++++++--------------------- fs/iomap/ioend.c | 2 -- include/linux/iomap.h | 2 -- 4 files changed, 36 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8275b6681b9b..b343a6f37563 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1885,7 +1885,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ - iomap_finish_folio_write(inode, ap->folios[i], 1); + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -2221,7 +2222,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, ap = &wpa->ia.ap; } - iomap_start_folio_write(inode, folio, 1); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0eb439b523b1..1873a2f74883 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1641,16 +1641,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len) +static void iomap_writeback_init(struct inode *inode, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + if (ifs) { + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set this to the folio size. 
After processing the folio for + * writeback in iomap_writeback_folio(), we'll subtract any + * ranges not written back. + * + * We do this because otherwise, we would have to atomically + * increment ifs->write_bytes_pending every time a range in the + * folio needs to be written back. + */ + atomic_set(&ifs->write_bytes_pending, folio_size(folio)); + } } -EXPORT_SYMBOL_GPL(iomap_start_folio_write); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) @@ -1667,7 +1676,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write); static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, u32 rlen, u64 end_pos, - bool *wb_pending) + size_t *bytes_submitted) { do { ssize_t ret; @@ -1681,11 +1690,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, pos += ret; /* - * Holes are not be written back by ->writeback_range, so track + * Holes are not written back by ->writeback_range, so track * if we did handle anything that is not a hole here. */ if (wpc->iomap.type != IOMAP_HOLE) - *wb_pending = true; + *bytes_submitted += ret; } while (rlen); return 0; @@ -1756,7 +1765,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - bool wb_pending = false; + size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1776,14 +1785,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) iomap_set_range_dirty(folio, 0, end_pos - pos); } - /* - * Keep the I/O completion handler from clearing the writeback - * bit until we have submitted all blocks by adding a bias to - * ifs->write_bytes_pending, which is dropped after submitting - * all blocks. 
- */ - WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - iomap_start_folio_write(inode, folio, 1); + iomap_writeback_init(inode, folio); } /* @@ -1798,13 +1800,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, - &wb_pending); + &bytes_submitted); if (error) break; pos += rlen; } - if (wb_pending) + if (bytes_submitted) wpc->nr_folios++; /* @@ -1822,12 +1824,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); - } else { - if (!wb_pending) - folio_end_writeback(folio); + /* + * Subtract any bytes that were initially accounted to + * write_bytes_pending but skipped for writeback. + */ + size_t bytes_not_submitted = folio_size(folio) - + bytes_submitted; + + if (bytes_not_submitted) + iomap_finish_folio_write(inode, folio, + bytes_not_submitted); + } else if (!bytes_submitted) { + folio_end_writeback(folio); } + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..86f44922ed3b 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -194,8 +194,6 @@ new_ioend: if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; - iomap_start_folio_write(wpc->inode, folio, map_len); - /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a5032e456079..b49e47f069db 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -478,8 +478,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); -- cgit v1.2.3 From f8eaf79406fe9415db0e7a5c175b50cb01265199 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:54 -0800 Subject: iomap: simplify ->read_folio_range() error handling for reads Instead of requiring that the caller calls iomap_finish_folio_read() even if the ->read_folio_range() callback returns an error, account for this internally in iomap instead, which makes the interface simpler and makes it match writeback's ->writeback_range() error handling expectations. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 7 ++- fs/fuse/file.c | 10 +--- fs/iomap/buffered-io.c | 63 ++++++++++++++------------ include/linux/iomap.h | 5 +- 4 files changed, 41 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 4d30723be7fa..64f4baf5750e 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -149,10 +149,9 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: iomap calls these functions: - ``read_folio_range``: Called to read in the range. This must be provided - by the caller. 
The caller is responsible for calling - iomap_finish_folio_read() after reading in the folio range. This should be - done even if an error is encountered during the read. This returns 0 on - success or a negative error on failure. + by the caller. If this succeeds, iomap_finish_folio_read() must be called + after the range is read in, regardless of whether the read succeeded or + failed. - ``submit_read``: Submit any pending read requests. This function is optional. diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b343a6f37563..7bcb650a9f26 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -922,13 +922,6 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, if (ctx->rac) { ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); - /* - * If fuse_handle_readahead was successful, fuse_readpages_end - * will do the iomap_finish_folio_read, else we need to call it - * here - */ - if (ret) - iomap_finish_folio_read(folio, off, len, ret); } else { /* * for non-readahead read requests, do reads synchronously @@ -936,7 +929,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, * out-of-order reads */ ret = fuse_do_readfolio(file, folio, off, len); - iomap_finish_folio_read(folio, off, len, ret); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); } return ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1873a2f74883..c82b5b24d4b3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -398,7 +398,8 @@ static void iomap_read_init(struct folio *folio) * has already finished reading in the entire folio. 
*/ spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += len + 1; + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + ifs->read_bytes_pending = len + 1; spin_unlock_irq(&ifs->state_lock); } } @@ -414,43 +415,47 @@ static void iomap_read_init(struct folio *folio) */ static void iomap_read_end(struct folio *folio, size_t bytes_submitted) { - struct iomap_folio_state *ifs; - - /* - * If there are no bytes submitted, this means we are responsible for - * unlocking the folio here, since no IO helper has taken ownership of - * it. - */ - if (!bytes_submitted) { - folio_unlock(folio); - return; - } + struct iomap_folio_state *ifs = folio->private; - ifs = folio->private; if (ifs) { bool end_read, uptodate; - /* - * Subtract any bytes that were initially accounted to - * read_bytes_pending but skipped for IO. - * The +1 accounts for the bias we added in iomap_read_init(). - */ - size_t bytes_not_submitted = folio_size(folio) + 1 - - bytes_submitted; spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending -= bytes_not_submitted; - /* - * If !ifs->read_bytes_pending, this means all pending reads - * by the IO helper have already completed, which means we need - * to end the folio read here. If ifs->read_bytes_pending != 0, - * the IO helper will end the folio read. - */ - end_read = !ifs->read_bytes_pending; + if (!ifs->read_bytes_pending) { + WARN_ON_ONCE(bytes_submitted); + end_read = true; + } else { + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. The +1 + * accounts for the bias we added in iomap_read_init(). + */ + size_t bytes_not_submitted = folio_size(folio) + 1 - + bytes_submitted; + ifs->read_bytes_pending -= bytes_not_submitted; + /* + * If !ifs->read_bytes_pending, this means all pending + * reads by the IO helper have already completed, which + * means we need to end the folio read here. If + * ifs->read_bytes_pending != 0, the IO helper will end + * the folio read. 
+ */ + end_read = !ifs->read_bytes_pending; + } if (end_read) uptodate = ifs_is_fully_uptodate(folio, ifs); spin_unlock_irq(&ifs->state_lock); if (end_read) folio_end_read(folio, uptodate); + } else if (!bytes_submitted) { + /* + * If there were no bytes submitted, this means we are + * responsible for unlocking the folio here, since no IO helper + * has taken ownership of it. If there were bytes submitted, + * then the IO helper will end the read via + * iomap_finish_folio_read(). + */ + folio_unlock(folio); } } @@ -498,10 +503,10 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, } else { if (!*bytes_submitted) iomap_read_init(folio); - *bytes_submitted += plen; ret = ctx->ops->read_folio_range(iter, ctx, plen); if (ret) return ret; + *bytes_submitted += plen; } ret = iomap_iter_advance(iter, plen); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b49e47f069db..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -495,9 +495,8 @@ struct iomap_read_ops { /* * Read in a folio range. * - * The caller is responsible for calling iomap_finish_folio_read() after - * reading in the folio range. This should be done even if an error is - * encountered during the read. + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. * * Returns 0 on success or a negative error on failure. */ -- cgit v1.2.3