From 9a8dbdadae509e5717ff6e5aa572ca0974d2101d Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:06 +0000 Subject: block/fs: Pass an iocb to generic_atomic_write_valid() Darrick and Hannes both thought it better that generic_atomic_write_valid() should be passed a struct iocb, and not just the member of that struct which is referenced; see [0] and [1]. I think that makes a more generic and clean API, so make that change. [0] https://lore.kernel.org/linux-block/680ce641-729b-4150-b875-531a98657682@suse.de/ [1] https://lore.kernel.org/linux-xfs/20240620212401.GA3058325@frogsfrogsfrogs/ Fixes: c34fc6f26ab8 ("fs: Initial atomic write support") Suggested-by: Darrick J. Wong Suggested-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..fbfa032d1d90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); +bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From c3be7ebbbce5201e151f17e28a6c807602f369c9 Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:07 +0000 Subject: fs/block: Check for IOCB_DIRECT in generic_atomic_write_valid() Currently FMODE_CAN_ATOMIC_WRITE is set if the bdev can atomic write and the file is open for direct IO. This does not work if the file is not opened for direct IO, yet fcntl(O_DIRECT) is used on the fd later. Change to check for direct IO on a per-IO basis in generic_atomic_write_valid(). Since we want to report -EOPNOTSUPP for non-direct IO for an atomic write, change to return an error code. Relocate the block fops atomic write checks to the common write path, as to catch non-direct IO. Fixes: c34fc6f26ab8 ("fs: Initial atomic write support") Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/fops.c | 18 ++++++++++-------- fs/read_write.c | 13 ++++++++----- include/linux/fs.h | 2 +- 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/fops.c b/block/fops.c index 968b47b615c4..2d01c9007681 100644 --- a/block/fops.c +++ b/block/fops.c @@ -36,11 +36,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) } static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, - struct iov_iter *iter, bool is_atomic) + struct iov_iter *iter) { - if (is_atomic && !generic_atomic_write_valid(iocb, iter)) - return true; - return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } @@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; - if (blkdev_dio_invalid(bdev, iocb, iter, is_atomic)) + if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); @@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); - } else if (is_atomic) { + } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); @@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; - if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); @@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_ATOMIC) { + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; diff --git a/fs/read_write.c b/fs/read_write.c index 2c3263530828..befec0b5c537 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1830,18 +1830,21 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) - return false; + return -EINVAL; if (!is_power_of_2(len)) - return false; + return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) - return false; + return -EINVAL; - return true; + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; + + return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fbfa032d1d90..ba47fb283730 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From 1eadb157947163ca72ba8963b915fdc099ce6cca Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:08 +0000 Subject: block: Add bdev atomic write limits helpers Add helpers to get atomic write limits for a bdev, so that we don't access request_queue helpers outside the block layer. We check if the bdev can actually atomic write in these helpers, so we can avoid users missing using this check. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50c3b959da28..c2cc3c146d74 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev) return true; } +static inline unsigned int +bdev_atomic_write_unit_min_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); +} + +static inline unsigned int +bdev_atomic_write_unit_max_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 9e0933c21c128d6d8ac4d8aae0babaf9a43100b8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:02 -0800 Subject: fs: iomap: Atomic write support Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC flag set. Initially FSes (XFS) should only support writing a single FS block atomically. As with any atomic write, we should produce a single bio which covers the complete write length. Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Reviewed-by: Ritesh Harjani (IBM) [djwong: clarify a couple of things in the docs] Signed-off-by: Darrick J. Wong --- Documentation/filesystems/iomap/operations.rst | 15 ++++++++++ fs/iomap/direct-io.c | 38 +++++++++++++++++++++++--- fs/iomap/trace.h | 3 +- include/linux/iomap.h | 1 + 4 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 8e6c721d2330..ee790f843cfa 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements: if the mapping is unwritten and the filesystem cannot handle zeroing the unaligned regions without exposing stale contents. + * ``IOMAP_ATOMIC``: This write is being issued with torn-write + protection. + Only a single bio can be created for the write, and the write must + not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be + set. + The file range to write must be aligned to satisfy the requirements + of both the filesystem and the underlying block device's atomic + commit capabilities. + If filesystem metadata updates are required (e.g. unwritten extent + conversion or copy on write), all updates for the entire file range + must be committed atomically as well. + Only one space mapping is allowed per untorn write. + Untorn writes must be aligned to, and must not be longer than, a + single file block. + Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f637aa0706a3..ed4764e3b8f0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic) + opflags |= REQ_ATOMIC; return opflags; } @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); + bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; + if (atomic && length != fs_block_size) + return -EINVAL; + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; @@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * can set up the page vector appropriately for a ZONE_APPEND * operation. */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + if (WARN_ON_ONCE(atomic && n != length)) { + /* + * This bio should have covered the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. + */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -659,7 +679,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. + */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 0a991c4ce87d..4118a42cdab0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ad12a3c8bae..c7644bdcfca3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,6 +178,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ +#define IOMAP_ATOMIC (1 << 9) struct iomap_ops { /* -- cgit v1.2.3