summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_aops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--fs/xfs/xfs_aops.c43
1 files changed, 38 insertions, 5 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 559a3a577097..6d9965b546cb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -19,6 +19,7 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_icache.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -131,7 +132,7 @@ xfs_end_ioend(
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
- error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+ error = xfs_setfilesize(ip, offset, size);
done:
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
@@ -528,12 +529,44 @@ xfs_vm_readahead(
}
static int
-xfs_iomap_swapfile_activate(
+xfs_vm_swap_activate(
struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+
+ /*
+ * Swap file activation can race against concurrent shared extent
+ * removal in files that have been cloned. If this happens,
+ * iomap_swapfile_iter() can fail because it encountered a shared
+ * extent even though an operation is in progress to remove those
+ * shared extents.
+ *
+ * This race becomes problematic when we defer extent removal
+ * operations beyond the end of a syscall (i.e. use async background
+ * processing algorithms). Users think the extents are no longer
+ * shared, but iomap_swapfile_iter() still sees them as shared
+ * because the refcountbt entries for the extents being removed have
+ * not yet been updated. Hence the swapon call fails unexpectedly.
+ *
+ * The race condition is currently most obvious from the unlink()
+ * operation as extent removal is deferred until after the last
+ * reference to the inode goes away. We then process the extent
+ * removal asynchronously, hence triggers the "syscall completed but
+ * work not done" condition mentioned above. To close this race
+ * window, we need to flush any pending inodegc operations to ensure
+ * they have updated the refcountbt records before we try to map the
+ * swapfile.
+ */
+ xfs_inodegc_flush(ip->i_mount);
+
+ /*
+ * Direct the swap code to the correct block device when this file
+ * sits on the RT device.
+ */
+ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
return iomap_swapfile_activate(sis, swap_file, span,
&xfs_read_iomap_ops);
}
@@ -549,11 +582,11 @@ const struct address_space_operations xfs_address_space_operations = {
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.dirty_folio = noop_dirty_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};