Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--  fs/xfs/xfs_aops.c   391
1 file changed, 324 insertions(+), 67 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 559a3a577097..1ee4f835ac3c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -19,6 +19,9 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -76,6 +79,26 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
+static void
+xfs_ioend_put_open_zones(
+ struct iomap_ioend *ioend)
+{
+ struct iomap_ioend *tmp;
+
+ /*
+ * Put the open zone for all ioends merged into this one (if any).
+ */
+ list_for_each_entry(tmp, &ioend->io_list, io_list)
+ xfs_open_zone_put(tmp->io_private);
+
+ /*
+ * The main ioend might not have an open zone if the submission failed
+ * before xfs_zone_alloc_and_submit got called.
+ */
+ if (ioend->io_private)
+ xfs_open_zone_put(ioend->io_private);
+}
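A quick count to make the reference model above concrete (illustrative, not part of the patch): if three ioends holding open-zone references were merged into this one, the loop drops three references via tmp->io_private and the final put drops the primary ioend's own reference, four puts in total. Only the primary ioend can lack a zone (when submission failed before xfs_zone_alloc_and_submit assigned one), hence the NULL check is needed on the final put only.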
+
/*
* IO write completion.
*/
@@ -85,6 +108,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -114,10 +138,11 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
+ ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
- offset + size);
+ offset + size, NULL);
}
goto done;
}
@@ -125,14 +150,21 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (is_zoned)
+ error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+ ioend->io_private, NULLFSBLOCK);
+ else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- if (!error && xfs_ioend_is_append(ioend))
- error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+ if (!error &&
+ !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
+ xfs_ioend_is_append(ioend))
+ error = xfs_setfilesize(ip, offset, size);
done:
+ if (is_zoned)
+ xfs_ioend_put_open_zones(ioend);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@@ -175,23 +207,74 @@ xfs_end_io(
}
}
-STATIC void
+void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
+ /*
+ * For zone append writes, record the block number that was actually
+ * written and set the boundary flag if needed.
+ */
+ if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ xfs_mark_rtg_boundary(ioend);
+ }
+
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
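For readers unfamiliar with the pattern xfs_end_bio uses, here is a minimal, self-contained sketch of the same irq-to-workqueue completion deferral (illustrative only, not part of this patch; all demo_* names are made up): the bio completion merely links the item onto a per-inode list under an irq-safe lock and queues the work item the first time the list becomes non-empty, and the work function later splices the list and does the sleeping, transactional processing in process context.

	#include <linux/bug.h>
	#include <linux/container_of.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/workqueue.h>

	/* Stand-ins for the iomap_ioend / xfs_inode fields this pattern uses. */
	struct demo_ioend {
		struct list_head	io_list;
	};

	struct demo_inode {
		spinlock_t		ioend_lock;
		struct list_head	ioend_list;
		struct work_struct	ioend_work;
	};

	/* Process context: drain whatever accumulated while the work was queued. */
	static void demo_end_io_work(struct work_struct *work)
	{
		struct demo_inode *di =
			container_of(work, struct demo_inode, ioend_work);
		struct demo_ioend *ioend, *next;
		LIST_HEAD(completed);
		unsigned long flags;

		spin_lock_irqsave(&di->ioend_lock, flags);
		list_splice_init(&di->ioend_list, &completed);
		spin_unlock_irqrestore(&di->ioend_lock, flags);

		list_for_each_entry_safe(ioend, next, &completed, io_list) {
			list_del(&ioend->io_list);
			/* sleeping / transactional completion work goes here */
		}
	}

	/* (Soft)irq context: defer, never sleep. Queue the work only on 0 -> 1. */
	static void demo_end_io(struct demo_inode *di, struct demo_ioend *ioend,
			struct workqueue_struct *wq)
	{
		unsigned long flags;

		spin_lock_irqsave(&di->ioend_lock, flags);
		if (list_empty(&di->ioend_list))
			WARN_ON_ONCE(!queue_work(wq, &di->ioend_work));
		list_add_tail(&ioend->io_list, &di->ioend_list);
		spin_unlock_irqrestore(&di->ioend_lock, flags);
	}

At setup time the inode would initialize the lock, list head and work item (INIT_WORK() pointing at demo_end_io_work), much as the XFS inode setup does for i_ioend_lock, i_ioend_list and i_ioend_work.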
/*
+ * We cannot cancel the ioend directly on error. We may have already set other
+ * pages under writeback and hence we have to run I/O completion to mark the
+ * error state of the pages under writeback appropriately.
+ *
+ * If the folio has delalloc blocks on it, the caller is asking us to punch them
+ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
+ * page that needs to be dirtied again before the delalloc mapping can be
+ * converted. This stale delalloc mapping can trip up a later direct I/O read
+ * operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why
+ * we see an ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+{
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+ /*
+ * The end of the punch range is always the offset of the first
+ * byte of the next folio. Hence the end offset is only dependent on the
+ * folio itself and not the start offset that is passed in.
+ */
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
+ folio_pos(folio) + folio_size(folio), NULL);
+}
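A worked instance of the punch range above (illustrative numbers): for a 64 KiB folio at file position 128 KiB whose writeback fails at pos == 132 KiB, the call punches the delalloc range [132 KiB, 192 KiB) — from the failing offset up to the first byte of the next folio — so no stale delalloc mapping is left behind under the parts of the folio that were never submitted.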
+
+/*
* Fast revalidation of the cached writeback mapping. Return true if the current
* mapping is valid, false otherwise.
*/
@@ -236,13 +319,12 @@ xfs_imap_valid(
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
- ssize_t count = i_blocksize(inode);
+ ssize_t count = i_blocksize(wpc->inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb;
@@ -394,76 +476,194 @@ allocate_blocks:
return 0;
}
+static ssize_t
+xfs_writeback_range(
+ struct iomap_writepage_ctx *wpc,
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
+{
+ ssize_t ret;
+
+ ret = xfs_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
+static bool
+xfs_ioend_needs_wq_completion(
+ struct iomap_ioend *ioend)
+{
+ /* Changing inode size requires a transaction. */
+ if (xfs_ioend_is_append(ioend))
+ return true;
+
+ /* Extent manipulation requires a transaction. */
+ if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
+ return true;
+
+ /* Page cache invalidation cannot be done in irq context. */
+ if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
+ return true;
+
+ return false;
+}
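In other words (a reading of the helper above, not new behaviour): only ioends that are pure overwrites of already-allocated, already-written blocks, that do not extend i_size and do not need page cache invalidation, skip xfs_end_bio and complete through iomap's default bio completion handler; everything else is parked on the per-inode i_ioend_list and finished from the m_unwritten_workqueue, as shown in xfs_end_bio earlier.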
+
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
- int status)
+xfs_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
{
- unsigned int nofs_flag;
+ struct iomap_ioend *ioend = wpc->wb_ctx;
/*
- * We can allocate memory here while doing writeback on behalf of
- * memory reclaim. To avoid memory allocation deadlocks set the
- * task-wide nofs context for the following operations.
+ * Convert CoW extents to regular.
+ *
+ * We can allocate memory here while doing writeback on behalf of memory
+ * reclaim. To avoid memory allocation deadlocks, set the task-wide
+ * nofs context.
*/
- nofs_flag = memalloc_nofs_save();
+ if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
+ unsigned int nofs_flag;
- /* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
- status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+ nofs_flag = memalloc_nofs_save();
+ error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
+ memalloc_nofs_restore(nofs_flag);
}
- memalloc_nofs_restore(nofs_flag);
-
- /* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ /*
+ * Send ioends that might require a transaction to the completion wq.
+ */
+ if (xfs_ioend_needs_wq_completion(ioend))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ return iomap_ioend_writeback_submit(wpc, error);
}
-/*
- * If the folio has delalloc blocks on it, the caller is asking us to punch them
- * out. If we don't, we can leave a stale delalloc mapping covered by a clean
- * page that needs to be dirtied again before the delalloc mapping can be
- * converted. This stale delalloc mapping can trip up a later direct I/O read
- * operation on the same region.
- *
- * We prevent this by truncating away the delalloc regions on the folio. Because
- * they are delalloc, we can do this without needing a transaction. Indeed - if
- * we get ENOSPC errors, we have to be able to do this truncation without a
- * transaction as there is no space left for block reservation (typically why
- * we see a ENOSPC in writeback).
- */
-static void
-xfs_discard_folio(
- struct folio *folio,
- loff_t pos)
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+ .writeback_range = xfs_writeback_range,
+ .writeback_submit = xfs_writeback_submit,
+};
+
+struct xfs_zoned_writepage_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct xfs_open_zone *open_zone;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
- struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
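The struct above follows the usual embedded-context idiom: the generic iomap layer only ever sees a pointer to the embedded ctx member, and XFS_ZWPC() recovers the containing structure from it. A tiny standalone illustration of the idiom (hypothetical names, not part of this patch):

	#include <stddef.h>

	struct generic_ctx {
		long	flags;			/* all the generic layer knows about */
	};

	struct private_ctx {
		struct generic_ctx	ctx;		/* embedded; this is what gets passed down */
		void			*cached_zone;	/* private state the generic layer never sees */
	};

	static inline struct private_ctx *to_private(struct generic_ctx *ctx)
	{
		/* the same computation container_of() performs in the kernel */
		return (struct private_ctx *)
			((char *)ctx - offsetof(struct private_ctx, ctx));
	}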
+
+static int
+xfs_zoned_map_blocks(
+ struct iomap_writepage_ctx *wpc,
+ loff_t offset,
+ unsigned int len)
+{
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_filblks_t count_fsb;
+ struct xfs_bmbt_irec imap, del;
+ struct xfs_iext_cursor icur;
if (xfs_is_shutdown(mp))
- return;
+ return -EIO;
- xfs_alert_ratelimited(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
- folio, ip->i_ino, pos);
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
/*
- * The end of the punch range is always the offset of the first
- * byte of the next folio. Hence the end offset is only dependent on the
- * folio itself and not the start offset that is passed in.
+ * All dirty data must be covered by delalloc extents. But truncate can
+ * remove delalloc extents underneath us or reduce their size.
+ * Returning a hole tells iomap to not write back any data from this
+ * range, which is the right thing to do in that case.
+ *
+ * Otherwise just tell iomap to treat ranges previously covered by a
+ * delalloc extent as mapped. The actual block allocation will be done
+ * just before submitting the bio.
+ *
+ * This implies we never map outside folios that are locked or marked
+ * as under writeback, and thus there is no need to check the fork sequence
+ * count here.
*/
- xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio));
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+ imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ if (imap.br_startoff > offset_fsb) {
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+ return 0;
+ }
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ count_fsb = end_fsb - offset_fsb;
+
+ del = imap;
+ xfs_trim_extent(&del, offset_fsb, count_fsb);
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+ XFS_BMAPI_REMAP);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ wpc->iomap.type = IOMAP_MAPPED;
+ wpc->iomap.flags = IOMAP_F_DIRTY;
+ wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+ wpc->iomap.offset = offset;
+ wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+ wpc->iomap.flags |= IOMAP_F_ANON_WRITE;
+
+ trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+ return 0;
}
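To make the two branches above concrete (illustrative numbers only): a writeback call for offset = 1 MiB, len = 64 KiB whose first COW-fork extent only starts at 1.25 MiB gets a hole mapping for [1 MiB, 1.25 MiB), so iomap skips writing back that range; if the extent does cover the request, the delalloc range is trimmed to it, removed from the COW fork with XFS_BMAPI_REMAP, and reported as an IOMAP_F_ANON_WRITE mapping on the RT device — the real block allocation then happens at submit time in xfs_zone_alloc_and_submit, as seen in xfs_zoned_writeback_submit below.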
-static const struct iomap_writeback_ops xfs_writeback_ops = {
- .map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
- .discard_folio = xfs_discard_folio,
+static ssize_t
+xfs_zoned_writeback_range(
+ struct iomap_writepage_ctx *wpc,
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
+{
+ ssize_t ret;
+
+ ret = xfs_zoned_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
+static int
+xfs_zoned_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (error) {
+ ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&ioend->io_bio);
+ return error;
+ }
+ xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
+ return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+ .writeback_range = xfs_zoned_writeback_range,
+ .writeback_submit = xfs_zoned_writeback_submit,
};
STATIC int
@@ -471,10 +671,35 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = { };
+ struct xfs_inode *ip = XFS_I(mapping->host);
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ if (xfs_is_zoned_inode(ip)) {
+ struct xfs_zoned_writepage_ctx xc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_zoned_writeback_ops
+ },
+ };
+ int error;
+
+ error = iomap_writepages(&xc.ctx);
+ if (xc.open_zone)
+ xfs_open_zone_put(xc.open_zone);
+ return error;
+ } else {
+ struct xfs_writepage_ctx wpc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_writeback_ops
+ },
+ };
+
+ return iomap_writepages(&wpc.ctx);
+ }
}
STATIC int
@@ -528,12 +753,44 @@ xfs_vm_readahead(
}
static int
-xfs_iomap_swapfile_activate(
+xfs_vm_swap_activate(
struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+
+ /*
+ * Swap file activation can race against concurrent shared extent
+ * removal in files that have been cloned. If this happens,
+ * iomap_swapfile_iter() can fail because it encountered a shared
+ * extent even though an operation is in progress to remove those
+ * shared extents.
+ *
+ * This race becomes problematic when we defer extent removal
+ * operations beyond the end of a syscall (i.e. use async background
+ * processing algorithms). Users think the extents are no longer
+ * shared, but iomap_swapfile_iter() still sees them as shared
+ * because the refcountbt entries for the extents being removed have
+ * not yet been updated. Hence the swapon call fails unexpectedly.
+ *
+ * The race condition is currently most obvious from the unlink()
+ * operation as extent removal is deferred until after the last
+ * reference to the inode goes away. We then process the extent
+ * removal asynchronously, which triggers the "syscall completed but
+ * work not done" condition mentioned above. To close this race
+ * window, we need to flush any pending inodegc operations to ensure
+ * they have updated the refcountbt records before we try to map the
+ * swapfile.
+ */
+ xfs_inodegc_flush(ip->i_mount);
+
+ /*
+ * Direct the swap code to the correct block device when this file
+ * sits on the RT device.
+ */
+ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
return iomap_swapfile_activate(sis, swap_file, span,
&xfs_read_iomap_ops);
}
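A concrete (hypothetical) reproduction of the race the comment describes: reflink-copy a swap candidate file, unlink the copy, then immediately run swapon on the original. The unlink defers the shared-extent removal to background inodegc, so without the xfs_inodegc_flush() above, iomap_swapfile_activate() may still find the extents marked shared in the refcount btree and reject the swapfile even though, from userspace's point of view, the sharing is already gone.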
@@ -549,11 +806,11 @@ const struct address_space_operations xfs_address_space_operations = {
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.dirty_folio = noop_dirty_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};