From c50105933f0c75aacc4f95c9bf36f7fbd9a83884 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:39:59 +0100 Subject: iomap: allow the file system to submit the writeback bios Change ->prepare_ioend to ->submit_ioend and require file systems that implement it to submit the bio. This is needed for file systems that do their own work on the bios before submitting them to the block layer like btrfs or zoned xfs. To make this easier also pass the writeback context to the method. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..dc8df4f779d4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -362,12 +362,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. */ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where -- cgit v1.2.3 From 710273330663241d9ca5fbed51909e65807556ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:00 +0100 Subject: iomap: simplify io_flags and io_type in struct iomap_ioend The ioend fields for distinct types of I/O are a bit complicated. Consolidate them into a single io_flag field with it's own flags decoupled from the iomap flags. This also prepares for adding a new flag that is unrelated to both of the iomap namespaces. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-3-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++----------------- fs/xfs/xfs_aops.c | 12 ++++++------ include/linux/iomap.h | 20 ++++++++++++++++++-- 3 files changed, 46 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7952bf004bdb..d8d271107e60 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1605,13 +1605,10 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; - if (next->io_flags & IOMAP_F_BOUNDARY) + if (next->io_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; @@ -1709,7 +1706,8 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { struct iomap_ioend *ioend; struct bio *bio; @@ -1724,8 +1722,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; + ioend->io_flags = ioend_flags; if (pos > wpc->iomap.offset) wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; ioend->io_inode = inode; @@ -1737,14 +1734,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, return ioend; } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) - return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; @@ -1779,14 +1775,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index aa88895673d8..8e60ceeb1520 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,7 +114,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); @@ -125,9 +125,9 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); if (!error && xfs_ioend_is_append(ioend)) @@ -410,7 +410,7 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,8 +418,8 @@ xfs_submit_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; if (status) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index dc8df4f779d4..9583f6456165 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset, sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); +/* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + /* * Structure for writeback I/O completions. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ -- cgit v1.2.3 From 034c29fb3e7c119c42e650986e280f025a1bec7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:01 +0100 Subject: iomap: add a IOMAP_F_ANON_WRITE flag Add a IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not have a target block assigned to it yet at iomap time and the file system will do that in the bio submission handler, splitting the I/O as needed. This is used to implement Zone Append based I/O for zoned XFS, where splitting writes to the hardware limits and assigning a zone to them happens just before sending the I/O off to the block layer, but could also be useful for other things like compressed I/O. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-4-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/design.rst | 4 ++++ fs/iomap/buffered-io.c | 13 +++++++++---- fs/iomap/direct-io.c | 6 ++++-- include/linux/iomap.h | 7 +++++++ 4 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..28ab3758c474 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -246,6 +246,10 @@ The fields are as follows: * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can be set by the filesystem for its own purposes. + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target + block assigned to it yet and the file system will do that in the bio + submission handler, splitting the I/O as needed. + These flags can be set by iomap itself during file operations. The filesystem should supply an ``->iomap_end`` function if it needs to observe these flags: diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d8d271107e60..ba795d72e546 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1691,10 +1691,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->submit_ioend) + if (wpc->ops->submit_ioend) { error = wpc->ops->submit_ioend(wpc, error); - else if (!error) - submit_bio(&wpc->ioend->io_bio); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); @@ -1744,7 +1748,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e..641649a04614 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,10 +81,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9583f6456165..eb0764945b42 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } -- cgit v1.2.3 From 5fcbd555d48390a8c819ba7fdf55fbfcabe05c80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:02 +0100 Subject: iomap: split bios to zone append limits in the submission handlers Provide helpers for file systems to split bios in the direct I/O and writeback I/O submission handlers. The split ioends are chained to the parent ioend so that only the parent ioend originally generated by the iomap layer will be processed after all the chained off children have completed. This is based on the block layer bio chaining that has supported a similar mechanism for a long time. This Follows btrfs' lead and don't try to build bios to hardware limits for zone append commands, but instead build them as normal unconstrained bios and split them to the hardware limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-5-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/Makefile | 1 + fs/iomap/buffered-io.c | 49 ++++++++++++++++------------ fs/iomap/ioend.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 15 ++++++++- 4 files changed, 130 insertions(+), 21 deletions(-) create mode 100644 fs/iomap/ioend.c (limited to 'include/linux') diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ba795d72e546..f67e13a9807a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -40,7 +40,8 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) @@ -1539,15 +1540,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * ioend after this. */ static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,6 +1567,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } +static u32 +iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + return iomap_finish_ioend_buffered(ioend); +} + /* * Ioend completion routine for merged bios. This can only be called from task * contexts as merged ioends can be of unbound length. Hence we have to break up @@ -1667,8 +1686,10 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* @@ -1713,7 +1734,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, loff_t pos, u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1721,21 +1741,10 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_flags = ioend_flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..3ff38c665c31 --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. + */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eb0764945b42..90c27875e39d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -353,12 +353,19 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, /* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend can split a bio generated + * by iomap. In that case the parent ioend it was split from is recorded + * in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completetion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ @@ -408,6 +415,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -479,4 +490,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From e523f2d4c974a819730830ce1c38834ee0cd7318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:05 +0100 Subject: iomap: optionally use ioends for direct I/O struct iomap_ioend currently tracks outstanding buffered writes and has some really nice code in core iomap and XFS to merge contiguous I/Os an defer them to userspace for completion in a very efficient way. For zoned writes we'll also need a per-bio user context completion to record the written blocks, and the infrastructure for that would look basically like the ioend handling for buffered I/O. So instead of reinventing the wheel, reuse the existing infrastructure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-8-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- fs/iomap/internal.h | 1 + fs/iomap/ioend.c | 2 ++ include/linux/iomap.h | 10 ++++++---- 4 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 277ece243770..138d246ec29d 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include #include @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -119,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -241,6 +244,47 @@ void iomap_dio_bio_end_io(struct bio *bio) } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index 36d5c56e073e..f6992a3bf66a 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -5,5 +5,6 @@ #define IOEND_BATCH_SIZE 4096 u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 97d43c50cdf7..44f254ecab55 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -41,6 +41,8 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) if (!atomic_dec_and_test(&ioend->io_remaining)) return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); return iomap_finish_ioend_buffered(ioend); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 90c27875e39d..5768b9f2a1cc 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -343,20 +343,22 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_UNWRITTEN (1U << 1) /* don't merge into previous ioend */ #define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) /* * Structure for writeback I/O completions. * - * File systems implementing ->submit_ioend can split a bio generated - * by iomap. In that case the parent ioend it was split from is recorded - * in ioend->io_parent. + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ -- cgit v1.2.3 From d06244c60aec1d5d1589efe6b611a5b91a49465c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:06 +0100 Subject: iomap: add a io_private field to struct iomap_ioend Add a private data field to struct iomap_ioend so that the file system can attach information to it. Zoned XFS will use this for a pointer to the open zone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-9-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/ioend.c | 1 + include/linux/iomap.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 44f254ecab55..18894ebba6db 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -23,6 +23,7 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode, ioend->io_offset = file_offset; ioend->io_size = bio->bi_iter.bi_size; ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; return ioend; } EXPORT_SYMBOL_GPL(iomap_init_ioend); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5768b9f2a1cc..b4be07e8ec94 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -370,6 +370,7 @@ struct iomap_ioend { struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; -- cgit v1.2.3 From 02b39c4655d52141e07e80e9b2772d96daf67ff6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:07 +0100 Subject: iomap: pass private data to iomap_page_mkwrite Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-10-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 +++- fs/xfs/xfs_file.c | 3 ++- fs/zonefs/file.c | 2 +- include/linux/iomap.h | 5 ++--- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 4abff64998fe..8c24d8611edf 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1489,11 +1489,13 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, return length; } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345e..785f6bbf1406 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1511,7 +1511,8 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + NULL); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b4be07e8ec94..d528eb4d5cfe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -316,9 +316,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, -- cgit v1.2.3 From c6d1b8d15450cf061648d4e36d622da9d755654a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:08 +0100 Subject: iomap: pass private data to iomap_zero_range Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-11-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/gfs2/bmap.c | 3 ++- fs/iomap/buffered-io.c | 6 ++++-- fs/xfs/xfs_iomap.c | 2 +- include/linux/iomap.h | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..366516b98b3f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8c24d8611edf..382647fda1d1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1391,13 +1391,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1465,7 +1466,8 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + NULL); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c..3410c55f544a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1497,7 +1497,7 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } int diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d528eb4d5cfe..eddf524ac749 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,7 +313,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, -- cgit v1.2.3 From ddd402bbbf669c4ada106fd2e4c799e2b5745e3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:09 +0100 Subject: iomap: pass private data to iomap_truncate_page Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-12-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 ++-- fs/xfs/xfs_iomap.c | 2 +- include/linux/iomap.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 382647fda1d1..3458f97d1b1e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1467,7 +1467,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, if (!off) return 0; return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, - NULL); + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3410c55f544a..5dd0922fe2d1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1512,5 +1512,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eddf524ac749..022d7f338c68 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -315,7 +315,7 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, -- cgit v1.2.3 From abb0ea1923a68ec8f45a7615ebad1fc87ea06da6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:44 -0500 Subject: iomap: factor out iomap length helper In preparation to support more granular iomap iter advancing, factor the pos/len values as parameters to length calculation. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-2-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 022d7f338c68..feb748eb6294 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -238,18 +238,33 @@ struct iomap_iter { int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); } /** -- cgit v1.2.3 From b51d30ff51f9c325b65c8cd66ff6590530b14041 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:49 -0500 Subject: iomap: export iomap_iter_advance() and return remaining length As a final step for generic iter advance, export the helper and update it to return the remaining length of the current iteration after the advance. This will usually be 0 in the iomap_iter() case, but will be useful for the various operations that iterate on their own and will be updated to advance as they progress. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-7-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 22 ++++++++-------------- include/linux/iomap.h | 1 + 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 8e0746ad80bd..544cd7a5a16b 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -15,22 +15,16 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) } /* - * Advance to the next range we need to map. - * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. count = 0). Hence the "finished - * iterating" case needs to distinguish between (count = 0) meaning we are done - * and (count = 0 && stale) meaning we need to remap the entire remaining range. + * Advance the current iterator position and output the length remaining for the + * current mapping. */ -static inline int iomap_iter_advance(struct iomap_iter *iter, s64 count) +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) { - if (WARN_ON_ONCE(count > iomap_length(iter))) + if (WARN_ON_ONCE(*count > iomap_length(iter))) return -EIO; - iter->pos += count; - iter->len -= count; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); return 0; } @@ -93,7 +87,7 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) * advanced at all (i.e. no work was done for some reason) unless the * mapping has been marked stale and needs to be reprocessed. */ - ret = iomap_iter_advance(iter, processed); + ret = iomap_iter_advance(iter, &processed); if (!ret && iter->len > 0) ret = 1; if (ret > 0 && !iter->processed && !stale) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index feb748eb6294..eed06ffdcfbd 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -236,6 +236,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** * iomap_length_trim - trimmed length of the current iomap iteration -- cgit v1.2.3 From bc264fea0f6f230e56f876cc4266b1982d20f35d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:50 -0500 Subject: iomap: support incremental iomap_iter advances The current iomap_iter iteration model reads the mapping from the filesystem, processes the subrange of the operation associated with the current mapping, and returns the number of bytes processed back to the iteration code. The latter advances the position and remaining length of the iter in preparation for the next iteration. At the _iter() handler level, this tends to produce a processing loop where the local code pulls the current position and remaining length out of the iter, iterates it locally based on file offset, and then breaks out when the associated range has been fully processed. This works well enough for current handlers, but upcoming enhancements require a bit more flexibility in certain situations. Enhancements for zero range will lead to a situation where the processing loop is no longer a pure ascending offset walk, but rather dictated by pagecache state and folio lookup. Since folio lookup and write preparation occur at different levels, it is more difficult to manage position and length outside of the iter. To provide more flexibility to certain iomap operations, introduce support for incremental iomap_iter advances from within the operation itself. This allows more granular advances for operations that might not use the typical file offset based walk. Note that the semantics for operations that use incremental advances is slightly different than traditional operations. Operations that advance the iter directly are expected to return success or failure (i.e. 0 or negative error code) in iter.processed rather than the number of bytes processed. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-8-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 32 +++++++++++++++++++++++++------- include/linux/iomap.h | 8 ++++++-- 2 files changed, 31 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 544cd7a5a16b..0ebcabc7df52 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -35,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -58,6 +60,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced = iter->processed > 0 ? iter->processed : 0; + u64 olen = iter->len; s64 processed; int ret; @@ -66,11 +70,22 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) if (!iter->iomap.length) goto begin; + /* + * If iter.processed is zero, the op may still have advanced the iter + * itself. Calculate the advanced and original length bytes based on how + * far pos has advanced for ->iomap_end(). + */ + if (!advanced) { + advanced = iter->pos - iter->iter_start_pos; + olen += advanced; + } + if (ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } @@ -81,8 +96,11 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) } /* - * Advance the iter and clear state from the previous iteration. Use - * iter->len to determine whether to continue onto the next mapping. + * Advance the iter and clear state from the previous iteration. This + * passes iter->processed because that reflects the bytes processed but + * not yet advanced by the iter handler. + * + * Use iter->len to determine whether to continue onto the next mapping. * Explicitly terminate in the case where the current iter has not * advanced at all (i.e. no work was done for some reason) unless the * mapping has been marked stale and needs to be reprocessed. @@ -90,7 +108,7 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) ret = iomap_iter_advance(iter, &processed); if (!ret && iter->len > 0) ret = 1; - if (ret > 0 && !iter->processed && !stale) + if (ret > 0 && !advanced && !stale) ret = 0; iomap_iter_reset_iomap(iter); if (ret <= 0) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eed06ffdcfbd..e180dacf434c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -218,8 +218,11 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @processed: The number of bytes the most recent iteration needs iomap_iter() + * to advance the iter, zero if the iter was already advanced, or a + * negative errno for an error during the operation. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -228,6 +231,7 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; + loff_t iter_start_pos; s64 processed; unsigned flags; struct iomap iomap; -- cgit v1.2.3 From edd3e3b7d210747dec723edd2b6cb49d140c1256 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Feb 2025 09:47:56 -0500 Subject: iomap: rename iomap_iter processed field to status The iter.processed field name is no longer appropriate now that iomap operations do not return the number of bytes processed. Rename the field to iter.status to reflect that a success or error code is expected. Also change the type to int as there is no longer a need for an s64. This reduces the size of iomap_iter by 8 bytes due to a combination of smaller type and reduction in structure padding. While here, fix up the return types of various _iter() helpers to reflect the type change. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250224144757.237706-12-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/dax.c | 20 ++++++++++---------- fs/iomap/buffered-io.c | 42 +++++++++++++++++++++--------------------- fs/iomap/direct-io.c | 2 +- fs/iomap/fiemap.c | 6 +++--- fs/iomap/iter.c | 12 ++++++------ fs/iomap/seek.c | 8 ++++---- fs/iomap/swapfile.c | 4 ++-- fs/iomap/trace.h | 8 ++++---- include/linux/iomap.h | 7 +++---- 9 files changed, 54 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 44701865ca94..cab3c5abe5cb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static s64 dax_unshare_iter(struct iomap_iter *iter) +static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -1328,7 +1328,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_unshare_iter(&iter); + iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); @@ -1356,12 +1356,12 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) return ret; } -static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); u64 length = iomap_length(iter); - s64 ret; + int ret; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) @@ -1416,7 +1416,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_zero_iter(&iter, did_zero); + iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); @@ -1588,7 +1588,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) - iomi.processed = dax_iomap_iter(&iomi, iter); + iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; @@ -1759,7 +1759,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { - iter.processed = -EIO; /* fs corruption? */ + iter.status = -EIO; /* fs corruption? */ continue; } @@ -1773,7 +1773,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (!(ret & VM_FAULT_ERROR)) { u64 length = PAGE_SIZE; - iter.processed = iomap_iter_advance(&iter, &length); + iter.status = iomap_iter_advance(&iter, &length); } } @@ -1889,7 +1889,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); if (ret != VM_FAULT_FALLBACK) { u64 length = PMD_SIZE; - iter.processed = iomap_iter_advance(&iter, &length); + iter.status = iomap_iter_advance(&iter, &length); } } @@ -2079,7 +2079,7 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, min(src_iter.len, dst_iter.len), same); if (status < 0) return ret; - src_iter.processed = dst_iter.processed = status; + src_iter.status = dst_iter.status = status; } return ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1518acbf8b09..cdb0fedcf3d2 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -362,7 +362,7 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(struct iomap_iter *iter, +static int iomap_readpage_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { const struct iomap *iomap = &iter->iomap; @@ -437,10 +437,10 @@ done: return iomap_iter_advance(iter, &length); } -static loff_t iomap_read_folio_iter(struct iomap_iter *iter, +static int iomap_read_folio_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t ret; + int ret; while (iomap_length(iter)) { ret = iomap_readpage_iter(iter, ctx); @@ -466,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -485,10 +485,10 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_read_folio); -static loff_t iomap_readahead_iter(struct iomap_iter *iter, +static int iomap_readahead_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t ret; + int ret; while (iomap_length(iter)) { if (ctx->cur_folio && @@ -538,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -898,10 +898,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { ssize_t total_written = 0; - long status = 0; + int status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; @@ -1021,7 +1021,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, iter.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1255,7 +1255,7 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; u64 bytes = iomap_length(iter); @@ -1315,7 +1315,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1334,7 +1334,7 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { u64 bytes = iomap_length(iter); int status; @@ -1408,7 +1408,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1427,20 +1427,20 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - s64 proc; + s64 status; if (range_dirty) { range_dirty = false; - proc = iomap_zero_iter_flush_and_stale(&iter); + status = iomap_zero_iter_flush_and_stale(&iter); } else { u64 length = iomap_length(&iter); - proc = iomap_iter_advance(&iter, &length); + status = iomap_iter_advance(&iter, &length); } - iter.processed = proc; + iter.status = status; continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); } return ret; } @@ -1461,7 +1461,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, +static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); @@ -1499,7 +1499,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + iter.status = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 8ebd5b3019a7..e1e32e2bb0bf 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -742,7 +742,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { - iomi.processed = iomap_dio_iter(&iomi, dio); + iomi.status = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 8a0d8b034218..6776b800bde7 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -39,7 +39,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t iomap_fiemap_iter(struct iomap_iter *iter, +static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { u64 length = iomap_length(iter); @@ -78,7 +78,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_fiemap_iter(&iter, fi, &prev); + iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); @@ -114,7 +114,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; - /* leave iter.processed unset to abort loop */ + /* leave iter.status unset to abort loop */ } if (ret) return 0; diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index e4dfe64029cc..6ffc6a7b9ba5 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -9,7 +9,7 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - iter->processed = 0; + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); } @@ -54,7 +54,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out - * of the loop body: leave @iter.processed unchanged, or set it to a negative + * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) @@ -86,8 +86,8 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) } /* detect old return semantics where this would advance */ - if (WARN_ON_ONCE(iter->processed > 0)) - iter->processed = -EIO; + if (WARN_ON_ONCE(iter->status > 0)) + iter->status = -EIO; /* * Use iter->len to determine whether to continue onto the next mapping. @@ -95,8 +95,8 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) * advanced at all (i.e. no work was done for some reason) unless the * mapping has been marked stale and needs to be reprocessed. */ - if (iter->processed < 0) - ret = iter->processed; + if (iter->status < 0) + ret = iter->status; else if (iter->len == 0 || (!advanced && !stale)) ret = 0; else diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 83c687d6ccc0..04d7919636c1 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,7 +10,7 @@ #include #include -static loff_t iomap_seek_hole_iter(struct iomap_iter *iter, +static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_hole_iter(&iter, &pos); + iter.status = iomap_seek_hole_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found hole before EOF */ @@ -56,7 +56,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t iomap_seek_data_iter(struct iomap_iter *iter, +static int iomap_seek_data_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_data_iter(&iter, &pos); + iter.status = iomap_seek_data_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found data before EOF */ diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 4395e46a4dc7..9ea185e58ca7 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. */ -static loff_t iomap_swapfile_iter(struct iomap_iter *iter, +static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { u64 length = iomap_length(iter); @@ -169,7 +169,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); if (ret < 0) return ret; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4118a42cdab0..9eab2c8ac3c5 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) - __field(s64, processed) + __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); - __entry->processed = iter->processed; + __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, - __entry->processed, + __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e180dacf434c..af9e51fba5f0 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -220,9 +220,8 @@ struct iomap_ops { * It is updated at the same time as @pos. * @iter_start_pos: The original start pos for the current iomap. Used for * incremental iter advance. - * @processed: The number of bytes the most recent iteration needs iomap_iter() - * to advance the iter, zero if the iter was already advanced, or a - * negative errno for an error during the operation. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -232,7 +231,7 @@ struct iomap_iter { loff_t pos; u64 len; loff_t iter_start_pos; - s64 processed; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; -- cgit v1.2.3 From d79c9cc512973ef6583c3bfc0b343f9d312d85b3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Feb 2025 09:47:57 -0500 Subject: iomap: introduce a full map advance helper Various iomap_iter_advance() calls advance by the full mapping length and thus have no need for the current length input or post-advance remaining length output from the standard advance function. Add an iomap_iter_advance_full() helper to clean up these cases. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250224144757.237706-13-bfoster@redhat.com Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/dax.c | 10 ++++------ fs/iomap/buffered-io.c | 3 +-- fs/iomap/fiemap.c | 3 +-- fs/iomap/swapfile.c | 4 +--- include/linux/iomap.h | 10 ++++++++++ 5 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index cab3c5abe5cb..7fd4cd9a51f2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1266,11 +1266,11 @@ static int dax_unshare_iter(struct iomap_iter *iter) u64 copy_len = iomap_length(iter); u32 mod; int id = 0; - s64 ret = iomap_length(iter); + s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return iomap_iter_advance(iter, &ret); + return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because @@ -1300,16 +1300,14 @@ static int dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) - ret = iomap_length(iter); - else + if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); if (ret < 0) return dax_mem2blk_err(ret); - return iomap_iter_advance(iter, &ret); + return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index cdb0fedcf3d2..ea5e32d810d5 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1433,8 +1433,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, range_dirty = false; status = iomap_zero_iter_flush_and_stale(&iter); } else { - u64 length = iomap_length(&iter); - status = iomap_iter_advance(&iter, &length); + status = iomap_iter_advance_full(&iter); } iter.status = status; continue; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 6776b800bde7..80675c42e94e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -42,7 +42,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { - u64 length = iomap_length(iter); int ret; if (iter->iomap.type == IOMAP_HOLE) @@ -56,7 +55,7 @@ static int iomap_fiemap_iter(struct iomap_iter *iter, return 0; advance: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 9ea185e58ca7..c1a762c10ce4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -97,8 +97,6 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { - u64 length = iomap_length(iter); - switch (iomap->type) { case IOMAP_MAPPED: case IOMAP_UNWRITTEN: @@ -135,7 +133,7 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return iomap_iter_advance(iter, &length); + return iomap_iter_advance_full(iter); } /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index af9e51fba5f0..1fd66bc29cc1 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -271,6 +271,16 @@ static inline u64 iomap_length(const struct iomap_iter *iter) return iomap_length_trim(iter, iter->pos, iter->len); } +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); +} + /** * iomap_iter_srcmap - return the source map for the current iomap iteration * @i: iteration structure -- cgit v1.2.3 From b2cd5ae693a3dc5b70a0f75fba96452c591a2047 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:39:59 -0700 Subject: iomap: make buffered writes work with RWF_DONTCACHE Add iomap buffered write support for RWF_DONTCACHE. If RWF_DONTCACHE is set for a write, mark the folios being written as uncached. Then writeback completion will drop the pages. The write_iter handler simply kicks off writeback for the pages, and writeback completion will take care of the rest. Signed-off-by: "Darrick J. Wong" Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-2-axboe@kernel.dk Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/design.rst | 5 +++++ Documentation/filesystems/iomap/operations.rst | 2 ++ fs/iomap/buffered-io.c | 4 ++++ include/linux/iomap.h | 1 + 4 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..7b91546750f5 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -352,6 +352,11 @@ operations: ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or ``RWF_NOWAIT``. + * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a + buffered file I/O and would like the kernel to drop the pagecache + after the I/O completes, if it isn't already being used by another + thread. + If it is necessary to read existing file contents from a `different `_ device or address range on a device, the filesystem should return that diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b0..584ff549f9a6 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. + Internal per-Folio State ------------------------ diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..ea863c3cf510 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -603,6 +603,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -1034,6 +1036,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..26b0dbe23e62 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -183,6 +183,7 @@ struct iomap_folio_ops { #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ #define IOMAP_ATOMIC (1 << 9) +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* -- cgit v1.2.3 From b4de0e9be963b95c46c4a5426e94059923d236d6 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:10 +0000 Subject: iomap: Rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW In future xfs will support a SW-based atomic write, so rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW to be clear which mode is being used. Also relocate setting of IOMAP_ATOMIC_HW to the write path in __iomap_dio_rw(), to be clear that this flag is only relevant to writes. Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-3-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 4 ++-- fs/ext4/inode.c | 2 +- fs/iomap/direct-io.c | 18 +++++++++--------- fs/iomap/trace.h | 2 +- include/linux/iomap.h | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index d1535109587a..0b9d7be23bce 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -514,8 +514,8 @@ IOMAP_WRITE`` with any combination of the following enhancements: if the mapping is unwritten and the filesystem cannot handle zeroing the unaligned regions without exposing stale contents. - * ``IOMAP_ATOMIC``: This write is being issued with torn-write - protection. + * ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write + protection based on HW-offload support. Only a single bio can be created for the write, and the write must not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be set. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..ba2f1e3db7c7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3467,7 +3467,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) return false; /* atomic writes are all-or-nothing */ - if (flags & IOMAP_ATOMIC) + if (flags & IOMAP_ATOMIC_HW) return false; /* can only try again if we wrote nothing */ diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e1e32e2bb0bf..c696ce980796 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -317,7 +317,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua, bool atomic) + const struct iomap *iomap, bool use_fua, bool atomic_hw) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -329,7 +329,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - if (atomic) + if (atomic_hw) opflags |= REQ_ATOMIC; return opflags; @@ -340,8 +340,8 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; + bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW; const loff_t length = iomap_length(iter); - bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -351,7 +351,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) u64 copied = 0; size_t orig_count; - if (atomic && length != fs_block_size) + if (atomic_hw && length != fs_block_size) return -EINVAL; if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || @@ -428,7 +428,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) goto out; } - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -461,7 +461,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) } n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE(atomic && n != length)) { + if (WARN_ON_ONCE(atomic_hw && n != length)) { /* * This bio should have covered the complete length, * which it doesn't, so error. We may need to zero out @@ -652,9 +652,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iocb->ki_flags & IOCB_ATOMIC) - iomi.flags |= IOMAP_ATOMIC; - if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -689,6 +686,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC_HW; + /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 9eab2c8ac3c5..69af89044ebd 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" }, \ - { IOMAP_ATOMIC, "ATOMIC" } + { IOMAP_ATOMIC_HW, "ATOMIC_HW" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index ea29388b2fba..87cd7079aaf3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,7 +189,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) #define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { -- cgit v1.2.3 From 794ca29dcc924cd3f16d12b6fba61074c992b8fd Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:13 +0000 Subject: iomap: Support SW-based atomic writes Currently atomic write support requires dedicated HW support. This imposes a restriction on the filesystem that disk blocks need to be aligned and contiguously mapped to FS blocks to issue atomic writes. XFS has no method to guarantee FS block alignment for regular, non-RT files. As such, atomic writes are currently limited to 1x FS block there. To deal with the scenario that we are issuing an atomic write over misaligned or discontiguous data blocks - and raise the atomic write size limit - support a SW-based software emulated atomic write mode. For XFS, this SW-based atomic writes would use CoW support to issue emulated untorn writes. It is the responsibility of the FS to detect discontiguous atomic writes and switch to IOMAP_DIO_ATOMIC_SW mode and retry the write. Indeed, SW-based atomic writes could be used always when the mounted bdev does not support HW offload, but this strategy is not initially expected to be used. Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-6-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 16 ++++++++++++++-- fs/iomap/direct-io.c | 4 +++- include/linux/iomap.h | 8 +++++++- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 0b9d7be23bce..b08a79d11d9f 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -526,8 +526,20 @@ IOMAP_WRITE`` with any combination of the following enhancements: conversion or copy on write), all updates for the entire file range must be committed atomically as well. Only one space mapping is allowed per untorn write. - Untorn writes must be aligned to, and must not be longer than, a - single file block. + Untorn writes may be longer than a single file block. In all cases, + the mapping start disk block must have at least the same alignment as + the write offset. + + * ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write + protection via a software mechanism provided by the filesystem. + All the disk block alignment and single bio restrictions which apply + to IOMAP_ATOMIC_HW do not apply here. + SW-based untorn writes would typically be used as a fallback when + HW-based untorn writes may not be issued, e.g. the range of the write + covers multiple extents, meaning that it is not possible to issue + a single bio. + All filesystem metadata updates for the entire file range must be + committed atomically as well. Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c696ce980796..c594f2cf3ab4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -686,7 +686,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } - if (iocb->ki_flags & IOCB_ATOMIC) + if (dio_flags & IOMAP_DIO_ATOMIC_SW) + iomi.flags |= IOMAP_ATOMIC_SW; + else if (iocb->ki_flags & IOCB_ATOMIC) iomi.flags |= IOMAP_ATOMIC_HW; /* for data sync or sync, we need sync completion processing */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 87cd7079aaf3..9cd93530013c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,8 +189,9 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC_HW (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */ #define IOMAP_DONTCACHE (1 << 10) +#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */ struct iomap_ops { /* @@ -502,6 +503,11 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Use software-based torn-write protection. + */ +#define IOMAP_DIO_ATOMIC_SW (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3