From 6270b8ac2f41858952074b23c2d3d9aa2fe1bfa9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 26 Feb 2026 07:46:46 +0900 Subject: xfs: remove scratch field from struct xfs_gc_bio The scratch field in struct xfs_gc_bio is unused. Remove it. Fixes: 102f444b57b3 ("xfs: rework zone GC buffer management") Signed-off-by: Damien Le Moal Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 7efeecd2d85f..309f70098524 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -96,7 +96,6 @@ struct xfs_gc_bio { */ xfs_fsblock_t old_startblock; xfs_daddr_t new_daddr; - struct xfs_zone_scratch *scratch; /* Are we writing to a sequential write required zone? */ bool is_seq; @@ -779,7 +778,6 @@ xfs_zone_gc_split_write( ihold(VFS_I(chunk->ip)); split_chunk->ip = chunk->ip; split_chunk->is_seq = chunk->is_seq; - split_chunk->scratch = chunk->scratch; split_chunk->offset = chunk->offset; split_chunk->len = split_len; split_chunk->old_startblock = chunk->old_startblock; -- cgit v1.2.3 From 0ca1a8331c0fa5e57844e003a5d667a15b1e002c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Mar 2026 09:31:58 -0800 Subject: xfs: fix race between healthmon unmount and read_iter xfs/1879 on one of my test VMs got stuck due to the xfs_io healthmon subcommand sleeping in wait_event_interruptible at: xfs_healthmon_read_iter+0x558/0x5f8 [xfs] vfs_read+0x248/0x320 ksys_read+0x78/0x120 Looking at xfs_healthmon_read_iter, in !O_NONBLOCK mode it will sleep until the mount cookie == DETACHED_MOUNT_COOKIE, there are events waiting to be formatted, or there are formatted events in the read buffer that could be copied to userspace. Poking into the running kernel, I see that there are zero events in the list, the read buffer is empty, and the mount cookie is indeed in DETACHED state. IOWs, xfs_healthmon_has_eventdata should have returned true, but instead we're asleep waiting for a wakeup. I think what happened here is that xfs_healthmon_read_iter and xfs_healthmon_unmount were racing with each other, and _read_iter lost the race. _unmount queued an unmount event, which woke up _read_iter. It found, formatted, and copied the event out to userspace. That cleared out the pending event list and emptied the read buffer. xfs_io then called read() again, so _has_eventdata decided that we should sleep on the empty event queue. Next, _unmount called xfs_healthmon_detach, which set the mount cookie to DETACHED. Unfortunately, it didn't call wake_up_all on the hm, so the wait_event_interruptible in the _read_iter thread remains asleep. That's why the test stalled. Fix this by moving the wake_up_all call to xfs_healthmon_detach. Fixes: b3a289a2a9397b ("xfs: create event queuing, formatting, and discovery infrastructure") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_healthmon.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c index 4a06d6632f65..26c325d34bd1 100644 --- a/fs/xfs/xfs_healthmon.c +++ b/fs/xfs/xfs_healthmon.c @@ -141,6 +141,16 @@ xfs_healthmon_detach( hm->mount_cookie = DETACHED_MOUNT_COOKIE; spin_unlock(&xfs_healthmon_lock); + /* + * Wake up any readers that might remain. This can happen if unmount + * races with the healthmon fd owner entering ->read_iter, having + * already emptied the event queue. + * + * In the ->release case there shouldn't be any readers because the + * only users of the waiter are read and poll. + */ + wake_up_all(&hm->wait); + trace_xfs_healthmon_detach(hm); xfs_healthmon_put(hm); } @@ -1027,13 +1037,6 @@ xfs_healthmon_release( * process can create another health monitor file. */ xfs_healthmon_detach(hm); - - /* - * Wake up any readers that might be left. There shouldn't be any - * because the only users of the waiter are read and poll. - */ - wake_up_all(&hm->wait); - xfs_healthmon_put(hm); return 0; } -- cgit v1.2.3 From 281cb17787d4284a7790b9cbd80fded826ca7739 Mon Sep 17 00:00:00 2001 From: hongao Date: Wed, 4 Mar 2026 19:29:14 +0800 Subject: xfs: Remove redundant NULL check after __GFP_NOFAIL kzalloc() is called with __GFP_NOFAIL, so a NULL return is not expected. Drop the redundant !map check in xfs_dabuf_map(). Also switch the nirecs-sized allocation to kcalloc(). Signed-off-by: hongao Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_da_btree.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 766631f0562e..09d4c17b3e7b 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2716,12 +2716,8 @@ xfs_dabuf_map( * larger one that needs to be free by the caller. */ if (nirecs > 1) { - map = kzalloc(nirecs * sizeof(struct xfs_buf_map), - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - if (!map) { - error = -ENOMEM; - goto out_free_irecs; - } + map = kcalloc(nirecs, sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); *mapp = map; } -- cgit v1.2.3 From 54fcd2f95f8d216183965a370ec69e1aab14f5da Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Wed, 4 Mar 2026 19:54:27 +0100 Subject: xfs: fix returned valued from xfs_defer_can_append xfs_defer_can_append returns a bool, it shouldn't be returning a NULL. Found by code inspection. Fixes: 4dffb2cbb483 ("xfs: allow pausing of pending deferred work items") Cc: # v6.8 Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Acked-by: Souptick Joarder Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_defer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 472c261163ed..c6909716b041 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -809,7 +809,7 @@ xfs_defer_can_append( /* Paused items cannot absorb more work */ if (dfp->dfp_flags & XFS_DEFER_PAUSED) - return NULL; + return false; /* Already full? */ if (ops->max_items && dfp->dfp_count >= ops->max_items) -- cgit v1.2.3 From f1d77b863b414586ee45e10d9837c9ab27d8692d Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 5 Mar 2026 16:49:21 +0800 Subject: xfs: remove redundant set null for ip->i_itemp ip->i_itemp has been set null in xfs_inode_item_destroy(), so there is no need set it null again in xfs_inode_free_callback(). Signed-off-by: Long Li Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_icache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index a7a09e7eec81..2040a9292ee6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -159,7 +159,6 @@ xfs_inode_free_callback( ASSERT(!test_bit(XFS_LI_IN_AIL, &ip->i_itemp->ili_item.li_flags)); xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; } kmem_cache_free(xfs_inode_cache, ip); -- cgit v1.2.3 From 186ac39b8a7d3ec7ce9c5dd45e5c2730177f375c Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 5 Mar 2026 16:49:22 +0800 Subject: xfs: ensure dquot item is deleted from AIL only after log shutdown In xfs_qm_dqflush(), when a dquot flush fails due to corruption (the out_abort error path), the original code removed the dquot log item from the AIL before calling xfs_force_shutdown(). This ordering introduces a subtle race condition that can lead to data loss after a crash. The AIL tracks the oldest dirty metadata in the journal. The position of the tail item in the AIL determines the log tail LSN, which is the oldest LSN that must be preserved for crash recovery. When an item is removed from the AIL, the log tail can advance past the LSN of that item. The race window is as follows: if the dquot item happens to be at the tail of the log, removing it from the AIL allows the log tail to advance. If a concurrent log write is sampling the tail LSN at the same time and subsequently writes a complete checkpoint (i.e., one containing a commit record) to disk before the shutdown takes effect, the journal will no longer protect the dquot's last modification. On the next mount, log recovery will not replay the dquot changes, even though they were never written back to disk, resulting in silent data loss. Fix this by calling xfs_force_shutdown() before xfs_trans_ail_delete() in the out_abort path. Once the log is shut down, no new log writes can complete with an updated tail LSN, making it safe to remove the dquot item from the AIL. Cc: stable@vger.kernel.org Fixes: b707fffda6a3 ("xfs: abort consistently on dquot flush failure") Signed-off-by: Long Li Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2b208e2c5264..69e9bc588c8b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1439,9 +1439,15 @@ xfs_qm_dqflush( return 0; out_abort: + /* + * Shut down the log before removing the dquot item from the AIL. + * Otherwise, the log tail may advance past this item's LSN while + * log writes are still in progress, making these unflushed changes + * unrecoverable on the next mount. + */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_dqfunlock(dqp); return error; } -- cgit v1.2.3 From 52a8a1ba883defbfe3200baa22cf4cd21985d51a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 4 Mar 2026 20:26:20 -0800 Subject: xfs: fix undersized l_iclog_roundoff values If the superblock doesn't list a log stripe unit, we set the incore log roundoff value to 512. This leads to corrupt logs and unmountable filesystems in generic/617 on a disk with 4k physical sectors... XFS (sda1): Mounting V5 Filesystem ff3121ca-26e6-4b77-b742-aaff9a449e1c XFS (sda1): Torn write (CRC failure) detected at log block 0x318e. Truncating head block from 0x3197. XFS (sda1): failed to locate log tail XFS (sda1): log mount/recovery failed: error -74 XFS (sda1): log mount failed XFS (sda1): Mounting V5 Filesystem ff3121ca-26e6-4b77-b742-aaff9a449e1c XFS (sda1): Ending clean mount ...on the current xfsprogs for-next which has a broken mkfs. xfs_info shows this... meta-data=/dev/sda1 isize=512 agcount=4, agsize=644992 blks = sectsz=4096 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=1 = reflink=1 bigtime=1 inobtcount=1 nrext64=1 = exchange=1 metadir=1 data = bsize=4096 blocks=2579968, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0, ftype=1, parent=1 log =internal log bsize=4096 blocks=16384, version=2 = sectsz=4096 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 = rgcount=0 rgsize=268435456 extents = zoned=0 start=0 reserved=0 ...observe that the log section has sectsz=4096 sunit=0, which means that the roundoff factor is 512, not 4096 as you'd expect. We should fix mkfs not to generate broken filesystems, but anyone can fuzz the ondisk superblock so we should be more cautious. I think the inadequate logic predates commit a6a65fef5ef8d0, but that's clearly going to require a different backport. Cc: stable@vger.kernel.org # v5.14 Fixes: a6a65fef5ef8d0 ("xfs: log stripe roundoff is a property of the log") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b96f262ba139..f807f8f4f705 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1357,6 +1357,8 @@ xlog_alloc_log( if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) log->l_iclog_roundoff = mp->m_sb.sb_logsunit; + else if (mp->m_sb.sb_logsectsize > 0) + log->l_iclog_roundoff = mp->m_sb.sb_logsectsize; else log->l_iclog_roundoff = BBSIZE; -- cgit v1.2.3 From 362c490980867930a098b99f421268fbd7ca05fd Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 10 Mar 2026 20:32:33 +0800 Subject: xfs: fix integer overflow in bmap intent sort comparator xfs_bmap_update_diff_items() sorts bmap intents by inode number using a subtraction of two xfs_ino_t (uint64_t) values, with the result truncated to int. This is incorrect when two inode numbers differ by more than INT_MAX (2^31 - 1), which is entirely possible on large XFS filesystems. Fix this by replacing the subtraction with cmp_int(). Cc: # v4.9 Fixes: 9f3afb57d5f1 ("xfs: implement deferred bmbt map/unmap operations") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_bmap_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e8775f254c89..b237a25d6045 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -245,7 +245,7 @@ xfs_bmap_update_diff_items( struct xfs_bmap_intent *ba = bi_entry(a); struct xfs_bmap_intent *bb = bi_entry(b); - return ba->bi_owner->i_ino - bb->bi_owner->i_ino; + return cmp_int(ba->bi_owner->i_ino, bb->bi_owner->i_ino); } /* Log bmap updates in the intent item. */ -- cgit v1.2.3