summary refs log tree commit diff
diff options
context:
space:
mode:
author Carlos Maiolino <cem@kernel.org> 2026-03-23 13:11:13 +0300
committer Carlos Maiolino <cem@kernel.org> 2026-03-23 13:11:13 +0300
commit df236c996bb4654a3e2a2358a8b40fecfbb0c6a1 (patch)
tree b05d1c4c119843d1324259697dedcc650fce6562
parent e9b7a02e5859e56e11579450ded40d66626790a7 (diff)
parent 388bb26b3d33de3c53a492824a4c5804151a0014 (diff)
download linux-df236c996bb4654a3e2a2358a8b40fecfbb0c6a1.tar.xz
Merge branch 'xfs-7.1-merge' into for-next
Signed-off-by: Carlos Maiolino <cem@kernel.org>
-rw-r--r-- fs/iomap/buffered-io.c 6
-rw-r--r-- fs/xfs/xfs_file.c 17
-rw-r--r-- fs/xfs/xfs_iomap.c 146
-rw-r--r-- fs/xfs/xfs_mount.c 75
-rw-r--r-- fs/xfs/xfs_mount.h 3
5 files changed, 169 insertions, 78 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index a0c46aadb97d..3be3627d4b50 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1632,16 +1632,12 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
while ((ret = iomap_iter(&iter, ops)) > 0) {
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
- if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) &&
- srcmap->type != IOMAP_UNWRITTEN))
- return -EIO;
-
if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) &&
(srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN)) {
s64 status;
- if (range_dirty) {
+ if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) {
range_dirty = false;
status = iomap_zero_iter_flush_and_stale(&iter);
} else {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7918968e1d62..845a97c9b063 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1306,6 +1306,23 @@ xfs_falloc_insert_range(
if (offset >= isize)
return -EINVAL;
+ /*
+ * Let writeback clean up EOF folio state before we bump i_size. The
+ * insert flushes before it starts shifting and under certain
+ * circumstances we can write back blocks that should technically be
+ * considered post-eof (and thus should not be submitted for writeback).
+ *
+ * For example, a large, dirty folio that spans EOF and is backed by
+ * post-eof COW fork preallocation can cause block remap into the data
+ * fork. This shifts back out beyond EOF, but creates an unexpectedly
+ * written post-eof block. The insert is going to flush, unmap and
+ * cancel prealloc across this whole range, so flush EOF now before we
+ * bump i_size to provide consistent behavior.
+ */
+ error = filemap_write_and_wait_range(inode->i_mapping, isize, isize);
+ if (error)
+ return error;
+
error = xfs_falloc_setsize(file, isize + len);
if (error)
return error;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index be86d43044df..dbd49e838889 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1590,6 +1590,7 @@ xfs_zoned_buffered_write_iomap_begin(
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
+ struct address_space *mapping = inode->i_mapping;
struct xfs_zone_alloc_ctx *ac = iter->private;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1614,6 +1615,7 @@ xfs_zoned_buffered_write_iomap_begin(
if (error)
return error;
+restart:
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -1651,14 +1653,6 @@ xfs_zoned_buffered_write_iomap_begin(
&smap))
smap.br_startoff = end_fsb; /* fake hole until EOF */
if (smap.br_startoff > offset_fsb) {
- /*
- * We never need to allocate blocks for zeroing a hole.
- */
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb,
- smap.br_startoff);
- goto out_unlock;
- }
end_fsb = min(end_fsb, smap.br_startoff);
} else {
end_fsb = min(end_fsb,
@@ -1691,6 +1685,33 @@ xfs_zoned_buffered_write_iomap_begin(
XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
/*
+ * When zeroing, don't allocate blocks for holes as they are already
+ * zeroes, but we need to ensure that no extents exist in both the data
+ * and COW fork to ensure this really is a hole.
+ *
+ * A window exists where we might observe a hole in both forks with
+ * valid data in cache. Writeback removes the COW fork blocks on
+ * submission but doesn't remap into the data fork until completion. If
+ * the data fork was previously a hole, we'll fail to zero. Until we
+ * find a way to avoid this transient state, check for dirty pagecache
+ * and flush to wait on blocks to land in the data fork.
+ */
+ if ((flags & IOMAP_ZERO) && srcmap->type == IOMAP_HOLE) {
+ if (filemap_range_needs_writeback(mapping, offset,
+ offset + count - 1)) {
+ xfs_iunlock(ip, lockmode);
+ error = filemap_write_and_wait_range(mapping, offset,
+ offset + count - 1);
+ if (error)
+ return error;
+ goto restart;
+ }
+
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, end_fsb);
+ goto out_unlock;
+ }
+
+ /*
* The block reservation is supposed to cover all blocks that the
* operation could possible write, but there is a nasty corner case
* where blocks could be stolen from underneath us:
@@ -1764,6 +1785,8 @@ xfs_buffered_write_iomap_begin(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF;
+ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
struct xfs_bmbt_irec imap, cmap;
struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
@@ -1808,30 +1831,96 @@ xfs_buffered_write_iomap_begin(
goto out_unlock;
/*
- * Search the data fork first to look up our source mapping. We
- * always need the data fork map, as we have to return it to the
- * iomap code so that the higher level write code can read data in to
- * perform read-modify-write cycles for unaligned writes.
+ * Search the data fork first to look up our source mapping. We always
+ * need the data fork map, as we have to return it to the iomap code so
+ * that the higher level write code can read data in to perform
+ * read-modify-write cycles for unaligned writes.
+ *
+ * Then search the COW fork extent list even if we did not find a data
+ * fork extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * blocks adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
*/
eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
imap.br_startoff = end_fsb; /* fake hole until the end */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof)
+ cow_fsb = cmap.br_startoff;
+ }
- /* We never need to allocate blocks for zeroing or unsharing a hole. */
- if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) &&
- imap.br_startoff > offset_fsb) {
+ /* We never need to allocate blocks for unsharing a hole. */
+ if ((flags & IOMAP_UNSHARE) && imap.br_startoff > offset_fsb) {
xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
goto out_unlock;
}
/*
+ * We may need to zero over a hole in the data fork if it's fronted by
+ * COW blocks and dirty pagecache. Scan such file ranges for dirty
+ * cache and fill the iomap batch with folios that need zeroing.
+ */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ loff_t start, end;
+ unsigned int fbatch_count;
+
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+
+ if (cow_fsb == NULLFILEOFF)
+ goto found_imap;
+ if (cow_fsb > offset_fsb) {
+ xfs_trim_extent(&imap, offset_fsb,
+ cow_fsb - offset_fsb);
+ goto found_imap;
+ }
+
+ /* no zeroing beyond eof, so split at the boundary */
+ if (offset_fsb >= eof_fsb)
+ goto found_imap;
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
+ xfs_trim_extent(&imap, offset_fsb,
+ eof_fsb - offset_fsb);
+
+ /* COW fork blocks overlap the hole */
+ xfs_trim_extent(&imap, offset_fsb,
+ cmap.br_startoff + cmap.br_blockcount - offset_fsb);
+ start = XFS_FSB_TO_B(mp, imap.br_startoff);
+ end = XFS_FSB_TO_B(mp, imap.br_startoff + imap.br_blockcount);
+ fbatch_count = iomap_fill_dirty_folios(iter, &start, end,
+ &iomap_flags);
+ xfs_trim_extent(&imap, offset_fsb,
+ XFS_B_TO_FSB(mp, start) - offset_fsb);
+
+ /*
+ * Report the COW mapping if we have folios to zero. Otherwise
+ * ignore the COW blocks as preallocation and report a hole.
+ */
+ if (fbatch_count) {
+ xfs_trim_extent(&cmap, imap.br_startoff,
+ imap.br_blockcount);
+ imap.br_startoff = end_fsb; /* fake hole */
+ goto found_cow;
+ }
+ goto found_imap;
+ }
+
+ /*
* For zeroing, trim extents that extend beyond the EOF block. If a
* delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
if (flags & IOMAP_ZERO) {
- xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-
if (isnullstartblock(imap.br_startblock) &&
offset_fsb >= eof_fsb)
goto convert_delay;
@@ -1864,24 +1953,13 @@ xfs_buffered_write_iomap_begin(
}
/*
- * Search the COW fork extent list even if we did not find a data fork
- * extent. This serves two purposes: first this implements the
- * speculative preallocation using cowextsize, so that we also unshare
- * block adjacent to shared blocks instead of just the shared blocks
- * themselves. Second the lookup in the extent list is generally faster
- * than going out to the shared extent tree.
+ * Now that we've handled any operation specific special cases, at this
+ * point we can report a COW mapping if found.
*/
- if (xfs_is_cow_inode(ip)) {
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
- cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
- &ccur, &cmap);
- if (!cow_eof && cmap.br_startoff <= offset_fsb) {
- trace_xfs_reflink_cow_found(ip, &cmap);
- goto found_cow;
- }
+ if (xfs_is_cow_inode(ip) &&
+ !cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ goto found_cow;
}
if (imap.br_startoff <= offset_fsb) {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ef1ea8a1238c..b24195f570cd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -44,17 +44,36 @@
#include "xfs_healthmon.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
-static int xfs_uuid_table_size;
-static uuid_t *xfs_uuid_table;
+static DEFINE_XARRAY_ALLOC(xfs_uuid_table);
+
+static uuid_t *
+xfs_uuid_search(
+ uuid_t *new_uuid)
+{
+ unsigned long index = 0;
+ uuid_t *uuid;
+
+ xa_for_each(&xfs_uuid_table, index, uuid) {
+ if (uuid_equal(uuid, new_uuid))
+ return uuid;
+ }
+ return NULL;
+}
+
+static void
+xfs_uuid_delete(
+ uuid_t *uuid,
+ unsigned int index)
+{
+ ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid));
+ xa_erase(&xfs_uuid_table, index);
+}
void
xfs_uuid_table_free(void)
{
- if (xfs_uuid_table_size == 0)
- return;
- kfree(xfs_uuid_table);
- xfs_uuid_table = NULL;
- xfs_uuid_table_size = 0;
+ ASSERT(xa_empty(&xfs_uuid_table));
+ xa_destroy(&xfs_uuid_table);
}
/*
@@ -66,7 +85,7 @@ xfs_uuid_mount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
- int hole, i;
+ int ret;
/* Publish UUID in struct super_block */
super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
@@ -80,30 +99,17 @@ xfs_uuid_mount(
}
mutex_lock(&xfs_uuid_table_mutex);
- for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
- if (uuid_is_null(&xfs_uuid_table[i])) {
- hole = i;
- continue;
- }
- if (uuid_equal(uuid, &xfs_uuid_table[i]))
- goto out_duplicate;
- }
-
- if (hole < 0) {
- xfs_uuid_table = krealloc(xfs_uuid_table,
- (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- GFP_KERNEL | __GFP_NOFAIL);
- hole = xfs_uuid_table_size++;
+ if (unlikely(xfs_uuid_search(uuid))) {
+ xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount",
+ uuid);
+ mutex_unlock(&xfs_uuid_table_mutex);
+ return -EINVAL;
}
- xfs_uuid_table[hole] = *uuid;
- mutex_unlock(&xfs_uuid_table_mutex);
- return 0;
-
- out_duplicate:
+ ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid,
+ xa_limit_32b, GFP_KERNEL);
mutex_unlock(&xfs_uuid_table_mutex);
- xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
- return -EINVAL;
+ return ret;
}
STATIC void
@@ -111,21 +117,12 @@ xfs_uuid_unmount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
- int i;
if (xfs_has_nouuid(mp))
return;
mutex_lock(&xfs_uuid_table_mutex);
- for (i = 0; i < xfs_uuid_table_size; i++) {
- if (uuid_is_null(&xfs_uuid_table[i]))
- continue;
- if (!uuid_equal(uuid, &xfs_uuid_table[i]))
- continue;
- memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
- break;
- }
- ASSERT(i < xfs_uuid_table_size);
+ xfs_uuid_delete(uuid, mp->m_uuid_table_index);
mutex_unlock(&xfs_uuid_table_mutex);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ddd4028be8d6..d964bae096ef 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -346,6 +346,9 @@ typedef struct xfs_mount {
/* Private data referring to a health monitor object. */
struct xfs_healthmon __rcu *m_healthmon;
+
+ /* Index of uuid record in the uuid xarray. */
+ unsigned int m_uuid_table_index;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)