diff options
| author | Carlos Maiolino <cem@kernel.org> | 2026-03-23 13:11:13 +0300 |
|---|---|---|
| committer | Carlos Maiolino <cem@kernel.org> | 2026-03-23 13:11:13 +0300 |
| commit | df236c996bb4654a3e2a2358a8b40fecfbb0c6a1 (patch) | |
| tree | b05d1c4c119843d1324259697dedcc650fce6562 | |
| parent | e9b7a02e5859e56e11579450ded40d66626790a7 (diff) | |
| parent | 388bb26b3d33de3c53a492824a4c5804151a0014 (diff) | |
| download | linux-df236c996bb4654a3e2a2358a8b40fecfbb0c6a1.tar.xz | |
Merge branch 'xfs-7.1-merge' into for-next
Signed-off-by: Carlos Maiolino <cem@kernel.org>
| -rw-r--r-- | fs/iomap/buffered-io.c | 6 | ||||
| -rw-r--r-- | fs/xfs/xfs_file.c | 17 | ||||
| -rw-r--r-- | fs/xfs/xfs_iomap.c | 146 | ||||
| -rw-r--r-- | fs/xfs/xfs_mount.c | 75 | ||||
| -rw-r--r-- | fs/xfs/xfs_mount.h | 3 |
5 files changed, 169 insertions, 78 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index a0c46aadb97d..3be3627d4b50 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1632,16 +1632,12 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && - srcmap->type != IOMAP_UNWRITTEN)) - return -EIO; - if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)) { s64 status; - if (range_dirty) { + if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) { range_dirty = false; status = iomap_zero_iter_flush_and_stale(&iter); } else { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7918968e1d62..845a97c9b063 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1306,6 +1306,23 @@ xfs_falloc_insert_range( if (offset >= isize) return -EINVAL; + /* + * Let writeback clean up EOF folio state before we bump i_size. The + * insert flushes before it starts shifting and under certain + * circumstances we can write back blocks that should technically be + * considered post-eof (and thus should not be submitted for writeback). + * + * For example, a large, dirty folio that spans EOF and is backed by + * post-eof COW fork preallocation can cause block remap into the data + * fork. This shifts back out beyond EOF, but creates an expectedly + * written post-eof block. The insert is going to flush, unmap and + * cancel prealloc across this whole range, so flush EOF now before we + * bump i_size to provide consistent behavior. + */ + error = filemap_write_and_wait_range(inode->i_mapping, isize, isize); + if (error) + return error; + error = xfs_falloc_setsize(file, isize + len); if (error) return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index be86d43044df..dbd49e838889 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1590,6 +1590,7 @@ xfs_zoned_buffered_write_iomap_begin( { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct address_space *mapping = inode->i_mapping; struct xfs_zone_alloc_ctx *ac = iter->private; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1614,6 +1615,7 @@ xfs_zoned_buffered_write_iomap_begin( if (error) return error; +restart: error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1651,14 +1653,6 @@ xfs_zoned_buffered_write_iomap_begin( &smap)) smap.br_startoff = end_fsb; /* fake hole until EOF */ if (smap.br_startoff > offset_fsb) { - /* - * We never need to allocate blocks for zeroing a hole. - */ - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, - smap.br_startoff); - goto out_unlock; - } end_fsb = min(end_fsb, smap.br_startoff); } else { end_fsb = min(end_fsb, @@ -1691,6 +1685,33 @@ xfs_zoned_buffered_write_iomap_begin( XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); /* + * When zeroing, don't allocate blocks for holes as they are already + * zeroes, but we need to ensure that no extents exist in both the data + * and COW fork to ensure this really is a hole. + * + * A window exists where we might observe a hole in both forks with + * valid data in cache. Writeback removes the COW fork blocks on + * submission but doesn't remap into the data fork until completion. If + * the data fork was previously a hole, we'll fail to zero. Until we + * find a way to avoid this transient state, check for dirty pagecache + * and flush to wait on blocks to land in the data fork. + */ + if ((flags & IOMAP_ZERO) && srcmap->type == IOMAP_HOLE) { + if (filemap_range_needs_writeback(mapping, offset, + offset + count - 1)) { + xfs_iunlock(ip, lockmode); + error = filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + if (error) + return error; + goto restart; + } + + xfs_hole_to_iomap(ip, iomap, offset_fsb, end_fsb); + goto out_unlock; + } + + /* * The block reservation is supposed to cover all blocks that the * operation could possible write, but there is a nasty corner case * where blocks could be stolen from underneath us: @@ -1764,6 +1785,8 @@ xfs_buffered_write_iomap_begin( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + xfs_fileoff_t cow_fsb = NULLFILEOFF; + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); struct xfs_bmbt_irec imap, cmap; struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; @@ -1808,30 +1831,96 @@ xfs_buffered_write_iomap_begin( goto out_unlock; /* - * Search the data fork first to look up our source mapping. We - * always need the data fork map, as we have to return it to the - * iomap code so that the higher level write code can read data in to - * perform read-modify-write cycles for unaligned writes. + * Search the data fork first to look up our source mapping. We always + * need the data fork map, as we have to return it to the iomap code so + * that the higher level write code can read data in to perform + * read-modify-write cycles for unaligned writes. + * + * Then search the COW fork extent list even if we did not find a data + * fork extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. */ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) imap.br_startoff = end_fsb; /* fake hole until the end */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof) + cow_fsb = cmap.br_startoff; + } - /* We never need to allocate blocks for zeroing or unsharing a hole. */ - if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) && - imap.br_startoff > offset_fsb) { + /* We never need to allocate blocks for unsharing a hole. */ + if ((flags & IOMAP_UNSHARE) && imap.br_startoff > offset_fsb) { xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); goto out_unlock; } /* + * We may need to zero over a hole in the data fork if it's fronted by + * COW blocks and dirty pagecache. Scan such file ranges for dirty + * cache and fill the iomap batch with folios that need zeroing. + */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + loff_t start, end; + unsigned int fbatch_count; + + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + + if (cow_fsb == NULLFILEOFF) + goto found_imap; + if (cow_fsb > offset_fsb) { + xfs_trim_extent(&imap, offset_fsb, + cow_fsb - offset_fsb); + goto found_imap; + } + + /* no zeroing beyond eof, so split at the boundary */ + if (offset_fsb >= eof_fsb) + goto found_imap; + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) + xfs_trim_extent(&imap, offset_fsb, + eof_fsb - offset_fsb); + + /* COW fork blocks overlap the hole */ + xfs_trim_extent(&imap, offset_fsb, + cmap.br_startoff + cmap.br_blockcount - offset_fsb); + start = XFS_FSB_TO_B(mp, imap.br_startoff); + end = XFS_FSB_TO_B(mp, imap.br_startoff + imap.br_blockcount); + fbatch_count = iomap_fill_dirty_folios(iter, &start, end, + &iomap_flags); + xfs_trim_extent(&imap, offset_fsb, + XFS_B_TO_FSB(mp, start) - offset_fsb); + + /* + * Report the COW mapping if we have folios to zero. Otherwise + * ignore the COW blocks as preallocation and report a hole. + */ + if (fbatch_count) { + xfs_trim_extent(&cmap, imap.br_startoff, + imap.br_blockcount); + imap.br_startoff = end_fsb; /* fake hole */ + goto found_cow; + } + goto found_imap; + } + + /* * For zeroing, trim extents that extend beyond the EOF block. If a * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. */ if (flags & IOMAP_ZERO) { - xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - if (isnullstartblock(imap.br_startblock) && offset_fsb >= eof_fsb) goto convert_delay; @@ -1864,24 +1953,13 @@ xfs_buffered_write_iomap_begin( } /* - * Search the COW fork extent list even if we did not find a data fork - * extent. This serves two purposes: first this implements the - * speculative preallocation using cowextsize, so that we also unshare - * block adjacent to shared blocks instead of just the shared blocks - * themselves. Second the lookup in the extent list is generally faster - * than going out to the shared extent tree. + * Now that we've handled any operation specific special cases, at this + * point we can report a COW mapping if found. */ - if (xfs_is_cow_inode(ip)) { - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, - &ccur, &cmap); - if (!cow_eof && cmap.br_startoff <= offset_fsb) { - trace_xfs_reflink_cow_found(ip, &cmap); - goto found_cow; - } + if (xfs_is_cow_inode(ip) && + !cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + goto found_cow; } if (imap.br_startoff <= offset_fsb) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ef1ea8a1238c..b24195f570cd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,17 +44,36 @@ #include "xfs_healthmon.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); -static int xfs_uuid_table_size; -static uuid_t *xfs_uuid_table; +static DEFINE_XARRAY_ALLOC(xfs_uuid_table); + +static uuid_t * +xfs_uuid_search( + uuid_t *new_uuid) +{ + unsigned long index = 0; + uuid_t *uuid; + + xa_for_each(&xfs_uuid_table, index, uuid) { + if (uuid_equal(uuid, new_uuid)) + return uuid; + } + return NULL; +} + +static void +xfs_uuid_delete( + uuid_t *uuid, + unsigned int index) +{ + ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid)); + xa_erase(&xfs_uuid_table, index); +} void xfs_uuid_table_free(void) { - if (xfs_uuid_table_size == 0) - return; - kfree(xfs_uuid_table); - xfs_uuid_table = NULL; - xfs_uuid_table_size = 0; + ASSERT(xa_empty(&xfs_uuid_table)); + xa_destroy(&xfs_uuid_table); } /* @@ -66,7 +85,7 @@ xfs_uuid_mount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; - int hole, i; + int ret; /* Publish UUID in struct super_block */ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); @@ -80,30 +99,17 @@ xfs_uuid_mount( } mutex_lock(&xfs_uuid_table_mutex); - for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { - if (uuid_is_null(&xfs_uuid_table[i])) { - hole = i; - continue; - } - if (uuid_equal(uuid, &xfs_uuid_table[i])) - goto out_duplicate; - } - - if (hole < 0) { - xfs_uuid_table = krealloc(xfs_uuid_table, - (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - GFP_KERNEL | __GFP_NOFAIL); - hole = xfs_uuid_table_size++; + if (unlikely(xfs_uuid_search(uuid))) { + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", + uuid); + mutex_unlock(&xfs_uuid_table_mutex); + return -EINVAL; } - xfs_uuid_table[hole] = *uuid; - mutex_unlock(&xfs_uuid_table_mutex); - return 0; - - out_duplicate: + ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid, + xa_limit_32b, GFP_KERNEL); mutex_unlock(&xfs_uuid_table_mutex); - xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); - return -EINVAL; + return ret; } STATIC void @@ -111,21 +117,12 @@ xfs_uuid_unmount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; - int i; if (xfs_has_nouuid(mp)) return; mutex_lock(&xfs_uuid_table_mutex); - for (i = 0; i < xfs_uuid_table_size; i++) { - if (uuid_is_null(&xfs_uuid_table[i])) - continue; - if (!uuid_equal(uuid, &xfs_uuid_table[i])) - continue; - memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); - break; - } - ASSERT(i < xfs_uuid_table_size); + xfs_uuid_delete(uuid, mp->m_uuid_table_index); mutex_unlock(&xfs_uuid_table_mutex); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ddd4028be8d6..d964bae096ef 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -346,6 +346,9 @@ typedef struct xfs_mount { /* Private data referring to a health monitor object. */ struct xfs_healthmon __rcu *m_healthmon; + + /* Index of uuid record in the uuid xarray. */ + unsigned int m_uuid_table_index; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) |
